(nutch) branch master updated: NUTCH-3029
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 98902236d NUTCH-3029 98902236d is described below commit 98902236d782615ea1b8676a477bfa735499810a Author: Markus Jelsma AuthorDate: Thu Mar 14 10:49:34 2024 + NUTCH-3029 --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 64719cdae..8ee957c09 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -192,7 +192,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { * * @param url url to get hostname for * @return hostname - * @throws URISyntaxException + * @throws URISyntaxException if the given string violates RFC 2396 */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url);
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new a8ec17ca8 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler a8ec17ca8 is described below commit a8ec17ca853b2488bf5d96538915a00a05064a31 Author: Markus Jelsma AuthorDate: Wed Mar 13 18:35:22 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 4d4a3af73..64719cdae 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -192,6 +192,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { * * @param url url to get hostname for * @return hostname + * @throws URISyntaxException */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url);
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 84cda2abd NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 84cda2abd is described below commit 84cda2abd500667222fdb00e503780ee0bdaaab4 Author: Markus Jelsma AuthorDate: Wed Mar 13 16:12:21 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index a403d5649..4d4a3af73 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -189,6 +189,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Strip a URL, leaving only the host name. + * + * @param url url to get hostname for + * @return hostname */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url); @@ -198,9 +201,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the max_interval for this URL, which might depend on the host. 
- * @param url the URL to be scheduled - * @param defaultMaxInterval the value to which to default - * if max_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host + * @return the configured maximum interval or the default interval */ public float getMaxInterval(Text url, float defaultMaxInterval){ if (hostSpecificMaxInterval.isEmpty()) { @@ -220,9 +224,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the min_interval for this URL, which might depend on the host. - * @param url the URL to be scheduled - * @param defaultMinInterval the value to which to default - * if min_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMinInterval the value to which to default if min_interval has not been configured for this host + * @return the configured minimum interval or the default interval */ public float getMinInterval(Text url, float defaultMinInterval){ if (hostSpecificMinInterval.isEmpty()) {
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 5ba50c0c6 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 5ba50c0c6 is described below commit 5ba50c0c6091a95818d3788f0d5b7c0ff49bec57 Author: Markus Jelsma AuthorDate: Wed Mar 13 14:53:10 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../apache/nutch/crawl/AdaptiveFetchSchedule.java | 159 - 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 5bccd4f30..a403d5649 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -22,11 +22,20 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.util.NutchConfiguration; +import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Map; +import java.util.HashMap; import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; /** * This class implements an adaptive re-fetch algorithm. 
This works as follows: @@ -79,9 +88,16 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { private double SYNC_DELTA_RATE; + private Configuration conf; + + private Map<String, Float> hostSpecificMaxInterval = new HashMap<>(); + + private Map<String, Float> hostSpecificMinInterval = new HashMap<>(); + @Override public void setConf(Configuration conf) { super.setConf(conf); +this.conf = conf; if (conf == null) return; INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); @@ -92,6 +108,136 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); SYNC_DELTA_RATE = conf.getFloat( "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); +try { + setHostSpecificIntervals("adaptive-host-specific-intervals.txt", +MIN_INTERVAL, MAX_INTERVAL); +} catch (IOException e){ + LOG.error("Failed reading the configuration file. ", e); +} + } + + /** + * Load host-specific min_intervals and max_intervals + * from the configuration file into the HashMaps. + */ + private void setHostSpecificIntervals(String fileName, +float defaultMin, float defaultMax) throws IOException { +Reader configReader = null; +configReader = conf.getConfResourceAsReader(fileName); +if (configReader == null) { + configReader = new FileReader(fileName); +} +BufferedReader reader = new BufferedReader(configReader); +String line; +int lineNo = 0; +while ((line = reader.readLine()) != null) { + lineNo++; + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { +line = line.trim(); +String[] parts = line.split("\\s+"); +if (parts.length == 3) { + // TODO: Maybe add host validation here? + // It might get computationally expensive for large files, though. 
+ String host = parts[0].trim().toLowerCase(); + String minInt = parts[1].trim(); + String maxInt = parts[2].trim(); + if (minInt.equalsIgnoreCase("default")){ minInt = "0"; } + if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; } + float m,M; + try { +m = Float.parseFloat(minInt); +M = Float.parseFloat(maxInt); + +//negative values and mismatched boundaries are ignored +//(default to global settings) +if (m < 0 || M < 0 || m > M){ + LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo) ++ " in the config. file: " + line); +} else { + + // min. interval should be positive and above the global minimum + if (m > 0 && m > defaultMin){ + hostSpecificMinInterval.put(host,m); + LOG.debug("Added custom min. interval " + m + " for host " + host + "."); + } else if (m > 0) { +LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo) + + " in the config. file: " + line); + } + +
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 4642c30c2 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 4642c30c2 is described below commit 4642c30c2aeb2a1fa2436541bd4af877d0aad86a Author: Markus Jelsma AuthorDate: Wed Mar 13 12:58:05 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- conf/adaptive-host-specific-intervals.txt.template | 14 ++ 1 file changed, 14 insertions(+) diff --git a/conf/adaptive-host-specific-intervals.txt.template b/conf/adaptive-host-specific-intervals.txt.template new file mode 100644 index 0..4aa7920d3 --- /dev/null +++ b/conf/adaptive-host-specific-intervals.txt.template @@ -0,0 +1,14 @@ +# This file defines a mapping that associates specific min. and max. refetching time intervals +# to a host, that deviate from the default settings of the AdaptiveFetchSchedule class. +# +# Format: <hostname> <min_interval> <max_interval> +# +# The two values will be parsed as float and should be STRICTLY between +# db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval. +# +# To use default values, write "default" or "0". +# The default min. is 60 (1 min) and default max. is 31536000 (1 year). +# +www.apache.org default 1728000 +www.example.org 1296000 0 +nutch.apache.org 864000 216
(nutch) branch master updated: NUTCH-3030 Use system default cipher suites instead of hard-coded set
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 551c50b1c NUTCH-3030 Use system default cipher suites instead of hard-coded set 551c50b1c is described below commit 551c50b1caac27ae65f25517de5b202b314fef0e Author: Markus Jelsma AuthorDate: Wed Mar 13 11:50:25 2024 + NUTCH-3030 Use system default cipher suites instead of hard-coded set --- .../apache/nutch/protocol/http/api/HttpBase.java | 63 +- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 1438754ce..034fa7840 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; +import javax.net.ssl.SSLSocketFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -320,57 +321,17 @@ public abstract class HttpBase implements Protocol { } } -String[] protocols = conf.getStrings("http.tls.supported.protocols", -"TLSv1.3", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); -String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", -"ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256", -"ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384", -"ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-CHACHA20-POLY1305", -"DHE-RSA-AES128-GCM-SHA256", "DHE-RSA-AES256-GCM-SHA384", -"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", -"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", -"TLS_RSA_WITH_AES_256_CBC_SHA256", -"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", -"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", 
-"TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", -"TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", -"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", -"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", -"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", -"TLS_DHE_DSS_WITH_AES_256_CBC_SHA", -"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", -"TLS_RSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", -"TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", -"TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", -"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", -"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", -"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", -"TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", -"TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA", -"TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA", -"TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", -"SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", -"SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", -"TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", -"TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", -"SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", -"TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", -"SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", -"SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", -"TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", -"TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", -
(nutch) branch master updated: NUTCH-3031 ProtocolFactory host mapper to support domains
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new c390dfc8b NUTCH-3031 ProtocolFactory host mapper to support domains c390dfc8b is described below commit c390dfc8b5c15db74d61c83e79f8e17d9bdc7b3f Author: Markus Jelsma AuthorDate: Tue Mar 12 17:29:20 2024 + NUTCH-3031 ProtocolFactory host mapper to support domains --- src/java/org/apache/nutch/protocol/ProtocolFactory.java | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java index a545a4cd0..dc274b7e1 100644 --- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java +++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java @@ -29,6 +29,7 @@ import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.plugin.PluginRuntimeException; import org.apache.nutch.util.ObjectCache; +import org.apache.nutch.util.URLUtil; import org.apache.commons.lang.StringUtils; @@ -130,8 +131,16 @@ public class ProtocolFactory { // First attempt to resolve a protocol implementation by hostname String host = url.getHost(); + String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String hostOrDomain = null; + Extension extension = null; if (hostProtocolMapping.containsKey(host)) { -Extension extension = getExtensionById(hostProtocolMapping.get(host)); +hostOrDomain = host; + } else if (hostProtocolMapping.containsKey(domain)) { +hostOrDomain = domain; + } + if (hostOrDomain != null) { +extension = getExtensionById(hostProtocolMapping.get(hostOrDomain)); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); } @@ -141,7 +150,7 @@ public class ProtocolFactory { if (protocol == null) { // Protocol listed in default map? 
if (defaultProtocolImplMapping.containsKey(url.getProtocol())) { - Extension extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol())); + extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol())); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); } @@ -150,7 +159,7 @@ public class ProtocolFactory { // Still couldn't find a protocol? Attempt by protocol if (protocol == null) { -Extension extension = findExtension(url.getProtocol(), "protocolName"); +extension = findExtension(url.getProtocol(), "protocolName"); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); }
(nutch) branch master updated: NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d95e1a79d NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java new 6b0455454 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch d95e1a79d is described below commit d95e1a79d665dfd10ae88e9985f3d85e398a751e Author: Markus Jelsma AuthorDate: Fri Jan 19 12:53:40 2024 +0100 NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java --- src/java/org/apache/nutch/util/domain/DomainSuffixes.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java index ae0d31b52..455f36712 100644 --- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java +++ b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java @@ -41,9 +41,9 @@ public class DomainSuffixes { /** private ctor */ private DomainSuffixes() { String file = "domain-suffixes.xml"; -InputStream input = this.getClass().getClassLoader() -.getResourceAsStream(file); -try { + +try (InputStream input = this.getClass().getClassLoader() +.getResourceAsStream(file)) { new DomainSuffixesReader().read(this, input); } catch (Exception ex) { LOG.warn(StringUtils.stringifyException(ex));
[nutch] branch master updated: NUTCH-2924 Generate maxCount expr evaluated only once
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7d3900450 NUTCH-2924 Generate maxCount expr evaluated only once 7d3900450 is described below commit 7d390045049036541d2fd94302ab97c8cb3e3cb1 Author: Markus Jelsma AuthorDate: Mon Dec 12 16:13:40 2022 +0100 NUTCH-2924 Generate maxCount expr evaluated only once --- src/java/org/apache/nutch/crawl/Generator.java | 103 +++-- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 0fce6b3b0..8a2f87ba4 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -311,27 +311,30 @@ public class Generator extends NutchTool implements Tool { private SequenceFile.Reader[] hostdbReaders = null; private JexlScript maxCountExpr = null; private JexlScript fetchDelayExpr = null; - -public void open() { - if (conf.get(GENERATOR_HOSTDB) != null) { -try { - Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); - hostdbReaders = SegmentReaderUtil.getReaders(path, conf); -} catch (IOException e) { - LOG.error("Error reading HostDB because {}", e.getMessage()); -} +private Map hostDatumCache = new HashMap<>(); + +public void readHostDb() throws IOException { + if (conf.get(GENERATOR_HOSTDB) == null) { +return; } -} - -public void close() { - if (hostdbReaders != null) { -try { - for (int i = 0; i < hostdbReaders.length; i++) { -hostdbReaders[i].close(); + + Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); + hostdbReaders = SegmentReaderUtil.getReaders(path, conf); + + try { +Text key = new Text(); +HostDatum value = new HostDatum(); +for (int i = 0; i < hostdbReaders.length; i++) { + while (hostdbReaders[i].next(key, value)) { +hostDatumCache.put(key.toString(), 
(HostDatum)value.clone()); } -} catch (IOException e) { - LOG.error("Error closing HostDB because {}", e.getMessage()); } + } catch (Exception e) { +throw new IOException(e); + } + + for (int i = 0; i < hostdbReaders.length; i++) { +hostdbReaders[i].close(); } } @@ -402,6 +405,8 @@ public class Generator extends NutchTool implements Tool { fetchDelayExpr = JexlUtil .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } + + readHostDb(); } @Override @@ -414,7 +419,7 @@ public class Generator extends NutchTool implements Tool { public void reduce(FloatWritable key, Iterable values, Context context) throws IOException, InterruptedException { - String hostname = null; + String currentHostname = null; HostDatum host = null; LongWritable variableFetchDelayWritable = null; // in millis Text variableFetchDelayKey = new Text("_variableFetchDelay_"); @@ -425,33 +430,31 @@ public class Generator extends NutchTool implements Tool { String urlString = url.toString(); URL u = null; -// Do this only once per queue -if (host == null) { - try { -hostname = URLUtil.getHost(urlString); -host = getHostDatum(hostname); - } catch (Exception e) { - } +String hostname = URLUtil.getHost(urlString); +if (!hostname.equals(currentHostname)) { + currentHostname = hostname; + host = hostDatumCache.get(hostname); // Got it? 
- if (host == null) { -// Didn't work, prevent future lookups -host = new HostDatum(); - } else { + if (host != null) { if (maxCountExpr != null) { - long variableMaxCount = Math - .round((double) maxCountExpr.execute(createContext(host))); - LOG.info("Generator: variable maxCount: {} for {}", - variableMaxCount, hostname); - maxCount = (int) variableMaxCount; + try { +long variableMaxCount = Math.round((double)maxCountExpr.execute(createContext(host))); +LOG.debug("Generator: variable maxCount: {} for {}", variableMaxCount, hostname); +maxCount = (int)variableMaxCount; + } catch (Exception e) { +LOG.error("Unable to execute variable maxCount expression because: " + e.getMessage(), e); + } } if (fetchDelay
[nutch] branch master updated: NUTCH-2977
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d806aa450 NUTCH-2977 d806aa450 is described below commit d806aa4507c59dcd680eac6f116df1eab22d996a Author: Markus Jelsma AuthorDate: Wed Dec 7 18:08:53 2022 +0100 NUTCH-2977 --- build.xml | 4 1 file changed, 4 insertions(+) diff --git a/build.xml b/build.xml index d7377ab25..004a12191 100644 --- a/build.xml +++ b/build.xml @@ -86,6 +86,10 @@ + + + +
[nutch] branch master updated: NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 1c2e411 NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite 1c2e411 is described below commit 1c2e4110ca4f4d739c6f9cde42d7a54ab52fa860 Author: Markus Jelsma AuthorDate: Wed Jun 17 13:21:24 2020 +0200 NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite --- .../src/java/org/apache/nutch/protocol/http/api/HttpBase.java | 8 1 file changed, 8 insertions(+) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index d7e330e..30e2432 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -311,6 +311,14 @@ public abstract class HttpBase implements Protocol { String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", +"ECDHE-ECDSA-AES128-GCM-SHA256", +"ECDHE-RSA-AES128-GCM-SHA256", +"ECDHE-ECDSA-AES256-GCM-SHA384", +"ECDHE-RSA-AES256-GCM-SHA384", +"ECDHE-ECDSA-CHACHA20-POLY1305", +"ECDHE-RSA-CHACHA20-POLY1305", +"DHE-RSA-AES128-GCM-SHA256", +"DHE-RSA-AES256-GCM-SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", "TLS_RSA_WITH_AES_256_CBC_SHA256",
[nutch] branch master updated: NUTCH-2612 Support for sitemap processing by hostname
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9dbb4be NUTCH-2612 Support for sitemap processing by hostname new 87b08fc Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch 9dbb4be is described below commit 9dbb4be71b248f61437375b21fc29934e03190db Author: Markus Jelsma AuthorDate: Mon Sep 9 15:00:30 2019 +0200 NUTCH-2612 Support for sitemap processing by hostname --- .../org/apache/nutch/util/SitemapProcessor.java| 98 +- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index cbfbe0c..18e3871 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -132,46 +132,27 @@ public class SitemapProcessor extends Configured implements Tool { context.write(key, (CrawlDatum) value); } else if (value instanceof HostDatum) { - // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap, - // extract urls and emit those - - // try different combinations of schemes one by one till we get rejection in all cases - String host = key.toString(); - if((url = filterNormalize("http://; + host + "/")) == null && - (url = filterNormalize("https://; + host + "/")) == null && - (url = filterNormalize("ftp://; + host + "/")) == null && - (url = filterNormalize("file:/" + host + "/")) == null) { -context.getCounter("Sitemap", "filtered_records").increment(1); -return; - } - // We may wish to use the robots.txt content as the third parameter for .getRobotRules - BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null); - List sitemaps = rules.getSitemaps(); - - if (tryDefaultSitemapXml && sitemaps.size() == 0) { -sitemaps.add(url + 
"sitemap.xml"); - } - for (String sitemap : sitemaps) { -context.getCounter("Sitemap", "sitemaps_from_hostdb").increment(1); -sitemap = filterNormalize(sitemap); -if (sitemap == null) { - context.getCounter("Sitemap", "filtered_sitemaps_from_hostdb") - .increment(1); -} else { - generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap), - sitemap, context); -} - } + generateSitemapsFromHostname(key.toString(), context); } else if (value instanceof Text) { - // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those - if((url = filterNormalize(key.toString())) == null) { -context.getCounter("Sitemap", "filtered_records").increment(1); -return; - } + // Input can be sitemap URL or hostname + url = key.toString(); + if (url.startsWith("http://;) || +url.startsWith("https://;) || +url.startsWith("ftp://;) || +url.startsWith("file:/")) { +// For entry from sitemap urls file, fetch the sitemap, extract urls and emit those +if((url = filterNormalize(url)) == null) { + context.getCounter("Sitemap", "filtered_records").increment(1); + return; +} - context.getCounter("Sitemap", "sitemap_seeds").increment(1); - generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context); +context.getCounter("Sitemap", "sitemap_seeds").increment(1); +generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context); + } else { +LOG.info("generateSitemapsFromHostname: " + key.toString()); +generateSitemapsFromHostname(key.toString(), context); + } } } catch (Exception e) { LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e)); @@ -191,6 +172,43 @@ public class SitemapProcessor extends Configured implements Tool { } return url; } + +private void generateSitemapsFromHostname(String host, Context context) { + try { +// For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap, +// extract urls and emit those + +// try diffe
[nutch] branch master updated: NUTCH-2725 Plugin lib-http to support per-host configurable cookies
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 54f73bf NUTCH-2725 Plugin lib-http to support per-host configurable cookies 54f73bf is described below commit 54f73bf78ded8b66ba262270d069232417bbe391 Author: Markus Jelsma AuthorDate: Mon Jul 29 12:44:49 2019 +0200 NUTCH-2725 Plugin lib-http to support per-host configurable cookies --- conf/cookies.txt | 3 ++ conf/nutch-default.xml | 8 .../apache/nutch/protocol/http/api/HttpBase.java | 56 ++ .../apache/nutch/protocol/http/HttpResponse.java | 23 ++--- .../nutch/protocol/httpclient/HttpResponse.java| 17 +-- .../nutch/protocol/okhttp/OkHttpResponse.java | 19 ++-- 6 files changed, 111 insertions(+), 15 deletions(-) diff --git a/conf/cookies.txt b/conf/cookies.txt new file mode 100644 index 000..f75f220 --- /dev/null +++ b/conf/cookies.txt @@ -0,0 +1,3 @@ +# Optional per-host configurable cookies. Format: +# +# \t diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a9ce899..e88991c 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -190,6 +190,14 @@ + http.agent.host.cookie.file + cookies.txt + +File containing per-host configured cookies. 
+ + + + http.agent.host Name or IP address of the host on which the Nutch crawler diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index bcc2e29..4b91f9c 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; @@ -45,6 +46,7 @@ import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.DeflateUtils; +import org.apache.nutch.util.URLUtil; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -66,6 +68,9 @@ public abstract class HttpBase implements Protocol { private HttpRobotRulesParser robots = null; private ArrayList userAgentNames = null; + + /** Mapping hostnames to cookies */ + private Map hostCookies = null; /** The proxy hostname. 
*/ protected String proxyHost = null; @@ -257,6 +262,42 @@ public abstract class HttpBase implements Protocol { .warn("Falling back to fixed user agent set via property http.agent.name"); } } + +// If cookies are enabled, try to load a per-host cookie file +if (enableCookieHeader) { + String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt"); + BufferedReader br = null; + try { +Reader reader = conf.getConfResourceAsReader(cookieFile); +br = new BufferedReader(reader); +hostCookies = new HashMap(); +String word = ""; +while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) { +if (word.indexOf("#") == -1) { // skip comment + String[] parts = word.split("\t"); + if (parts.length == 2) { +hostCookies.put(parts[0], parts[1]); + } else { +LOG.warn("Unable to parse cookie file correctly at: " + word); + } +} + } +} + } catch (Exception e) { +logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile, +StringUtils.stringifyException(e)); +hostCookies = null; + } finally { +if (br != null) { + try { +br.close(); + } catch (IOException e) { +// ignore + } +} + } +} String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); @@ -479,6 +520,21 @@ public abstract class HttpBase implements Protocol { } return userAgent; } + + /** + * If per-host cookies are configured, this method will look it up + * for the given url. + * + * @param url the url to look-up a cookie for + * @return the cookie or null + */ + public String getCookie(URL url) { +if (hostCookies != null) { + return hostCookies.get(url.getHost()); +} + +return null; + } /** * Value of "Accept-Language&
[nutch] branch master updated: NUTCH-2724 Metadata indexer not to emit empty values
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new a67c9be NUTCH-2724 Metadata indexer not to emit empty values a67c9be is described below commit a67c9bee94049d37dad9278cdf8dd9131735da43 Author: Markus Jelsma AuthorDate: Mon Jul 15 12:25:42 2019 +0200 NUTCH-2724 Metadata indexer not to emit empty values --- .../java/org/apache/nutch/indexer/metadata/MetadataIndexer.java| 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index 3927bd3..be56377 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -93,12 +93,15 @@ public class MetadataIndexer implements IndexingFilter { protected void add(NutchDocument doc, String key, String value) { if (separator == null || value.indexOf(separator) == -1 || !mvFields.contains(key)) { - doc.add(key, value); + value = value.trim(); + if (!value.isEmpty()) { +doc.add(key, value); + } } else { String[] parts = value.split(separator); for (String part : parts) { part = part.trim(); -if (part.length() != 0) { +if (!part.isEmpty()) { doc.add(key, part); } }
[nutch] branch master updated: NUTCH-2723 Indexer Solr not to decode URLs before deletion
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 5150c44 NUTCH-2723 Indexer Solr not to decode URLs before deletion new 9692464 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch 5150c44 is described below commit 5150c442a78d15c042ee6fb12e6dbea8ec5341e6 Author: Markus Jelsma AuthorDate: Fri Jul 12 12:09:34 2019 +0200 NUTCH-2723 Indexer Solr not to decode URLs before deletion --- .../org/apache/nutch/indexwriter/solr/SolrIndexWriter.java | 12 1 file changed, 12 deletions(-) diff --git a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java index 475d313..cc2e8d7 100644 --- a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java +++ b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java @@ -18,8 +18,6 @@ package org.apache.nutch.indexwriter.solr; import java.lang.invoke.MethodHandles; import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; import java.time.format.DateTimeFormatter; import java.util.AbstractMap; import java.util.ArrayList; @@ -153,16 +151,6 @@ public class SolrIndexWriter implements IndexWriter { } public void delete(String key) throws IOException { -try { - key = URLDecoder.decode(key, "UTF8"); -} catch (UnsupportedEncodingException e) { - LOG.error("Error decoding: " + key); - throw new IOException("UnsupportedEncodingException for " + key); -} catch (IllegalArgumentException e) { - LOG.warn("Could not decode: " + key - + ", it probably wasn't encoded in the first place.."); -} - // escape solr hash separator key = key.replaceAll("!", "\\!");
[nutch] branch master updated: NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7e6eabb NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages 7e6eabb is described below commit 7e6eabbc2b0a0b5ee91148a9effc6447af5057ba Author: Markus Jelsma AuthorDate: Thu Apr 11 12:32:22 2019 +0200 NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages --- conf/nutch-default.xml | 9 + .../org/apache/nutch/parse/tika/TikaParser.java| 22 ++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a4b202f..951494e 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1600,6 +1600,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + tika.extractor.boilerpipe.mime.types + text/html,application/xhtml+xml + +Comma-separated list of MIME types accepted for Boilerpipe extraction, +documents of other MIME types are not passed to the Boilerpipe extractor. 
+ + + diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 7440333..40aa265 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -21,8 +21,11 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -73,6 +76,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private boolean upperCaseElementNames = true; private String boilerpipeExtractorName; private boolean useBoilerpipe; + private Set boilerpipeMimeTypes; public ParseResult getParse(Content content) { HTMLDocumentImpl doc = new HTMLDocumentImpl(); @@ -114,7 +118,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { ContentHandler domHandler; // Check whether to use Tika's BoilerplateContentHandler -if (useBoilerpipe) { +if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) { BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler( (ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); @@ -291,16 +295,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser { } } -htmlParseFilters = new HtmlParseFilters(getConf()); +htmlParseFilters = new HtmlParseFilters(conf); utils = new DOMContentUtils(conf); -cachingPolicy = getConf().get("parser.caching.forbidden.policy", +cachingPolicy = conf.get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT); -upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", +upperCaseElementNames = 
conf.getBoolean("tika.uppercase.element.names", true); -useBoilerpipe = getConf().get("tika.extractor", "none") -.equals("boilerpipe"); -boilerpipeExtractorName = getConf() -.get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); +useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe"); +boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm", +"ArticleExtractor"); +boilerpipeMimeTypes = new HashSet<>(Arrays +.asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types", +"text/html", "application/xhtml+xml"))); } public Configuration getConf() {
[nutch] branch master updated: NUTCH-2692 Removing previously accidentally added file
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new f7fdca3 NUTCH-2692 Removing previously accidentally added file f7fdca3 is described below commit f7fdca37fe15f95955ec9082943a9392a578b728 Author: Markus Jelsma AuthorDate: Fri Feb 22 17:07:29 2019 +0100 NUTCH-2692 Removing previously accidentally added file --- conf/host-protocol-mapping.txt | 11 --- 1 file changed, 11 deletions(-) diff --git a/conf/host-protocol-mapping.txt b/conf/host-protocol-mapping.txt deleted file mode 100644 index d0a1b70..000 --- a/conf/host-protocol-mapping.txt +++ /dev/null @@ -1,11 +0,0 @@ -# This file defines a hostname to protocol plugin mapping. Each line takes a -# host name followed by a tab, followed by the ID of the protocol plugin. You -# can find the ID in the protocol plugin's plugin.xml file. -# -# \t\n -# nutch.apache.org org.apache.nutch.protocol.httpclient.Http -# tika.apache.org org.apache.nutch.protocol.http.Http -# -nutch.apache.org org.apache.nutch.protocol.httpclient.Http -tika.apache.org org.apache.nutch.protocol.http.Http -
[nutch] 02/03: NUTCH-2692 Subcollection to support case-insensitive white and black lists
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 3fa2f4a7efac598258eb01a4387b5fde43c1a813 Author: Markus Jelsma AuthorDate: Fri Feb 22 16:46:42 2019 +0100 NUTCH-2692 Subcollection to support case-insensitive white and black lists --- conf/host-protocol-mapping.txt | 11 +++ 1 file changed, 11 insertions(+) diff --git a/conf/host-protocol-mapping.txt b/conf/host-protocol-mapping.txt new file mode 100644 index 000..d0a1b70 --- /dev/null +++ b/conf/host-protocol-mapping.txt @@ -0,0 +1,11 @@ +# This file defines a hostname to protocol plugin mapping. Each line takes a +# host name followed by a tab, followed by the ID of the protocol plugin. You +# can find the ID in the protocol plugin's plugin.xml file. +# +# \t\n +# nutch.apache.org org.apache.nutch.protocol.httpclient.Http +# tika.apache.org org.apache.nutch.protocol.http.Http +# +nutch.apache.org org.apache.nutch.protocol.httpclient.Http +tika.apache.org org.apache.nutch.protocol.http.Http +
[nutch] 01/03: NUTCH-2692 Subcollection to support case-insensitive white and black lists
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 89c41e1b5a245322b27e8dd0728b543faa171e9d Author: Markus Jelsma AuthorDate: Fri Feb 22 16:44:25 2019 +0100 NUTCH-2692 Subcollection to support case-insensitive white and black lists --- conf/nutch-default.xml | 8 .../src/java/org/apache/nutch/collection/Subcollection.java | 13 - .../indexer/subcollection/SubcollectionIndexingFilter.java | 6 ++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a42e6a9..69fbb7d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2407,6 +2407,14 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + subcollection.case.insensitive + false + + Whether the URL prefixes are to be treated case insensitive. + + + diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java index 13064eb..8478390 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java @@ -69,6 +69,11 @@ public class Subcollection extends Configured implements URLFilter { * SubCollection blacklist as String */ String blString; + + /** + * Whether the white and black lists are case sensitive + */ + boolean caseInsensitive = false; /** * public Constructor @@ -95,10 +100,12 @@ public class Subcollection extends Configured implements URLFilter { this.id = id; this.key = key; this.name = name; +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } public Subcollection(Configuration conf) { super(conf); +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } /** @@ -231,7 +238,11 @@ public class Subcollection extends Configured 
implements URLFilter { while (st.hasMoreElements()) { String line = (String) st.nextElement(); - list.add(line.trim()); + line = line.trim(); + if (caseInsensitive) { +line = line.toLowerCase(); + } + list.add(line); } } diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java index 898d314..767d54d 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java @@ -36,6 +36,7 @@ public class SubcollectionIndexingFilter extends Configured implements IndexingFilter { private Configuration conf; + private boolean caseInsensitive = false; public SubcollectionIndexingFilter() { super(NutchConfiguration.create()); @@ -52,7 +53,9 @@ public class SubcollectionIndexingFilter extends Configured implements this.conf = conf; fieldName = conf.get("subcollection.default.fieldname", "subcollection"); metadataSource = conf.get("subcollection.metadata.source", "subcollection"); +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } + /** * @return Configuration @@ -102,6 +105,9 @@ public class SubcollectionIndexingFilter extends Configured implements } String sUrl = url.toString(); +if (caseInsensitive) { + sUrl = sUrl.toLowerCase(); +} addSubCollectionField(doc, sUrl); return doc; }
[nutch] branch master updated (78af89f -> 0085ee7)
This is an automated email from the ASF dual-hosted git repository. markus pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git. from 78af89f Merge pull request #436 from r0ann3l/NUTCH-2684 new 89c41e1 NUTCH-2692 Subcollection to support case-insensitive white and black lists new 3fa2f4a NUTCH-2692 Subcollection to support case-insensitive white and black lists new 0085ee7 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: ...tocol-mapping.txt.template => host-protocol-mapping.txt} | 13 - conf/nutch-default.xml | 8 .../src/java/org/apache/nutch/collection/Subcollection.java | 13 - .../indexer/subcollection/SubcollectionIndexingFilter.java | 6 ++ 4 files changed, 30 insertions(+), 10 deletions(-) copy conf/{host-protocol-mapping.txt.template => host-protocol-mapping.txt} (50%)
[nutch] 03/03: Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 0085ee740e78b58091d1aa39614277f1a612810c Merge: 3fa2f4a 78af89f Author: Markus Jelsma AuthorDate: Fri Feb 22 16:48:45 2019 +0100 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch conf/nutch-default.xml | 18 - src/java/org/apache/nutch/crawl/CrawlDbReader.java | 2 +- .../org/apache/nutch/crawl/CrawlDbReducer.java | 4 +- src/java/org/apache/nutch/crawl/Generator.java | 8 +- src/java/org/apache/nutch/fetcher/QueueFeeder.java | 91 -- .../apache/nutch/hostdb/UpdateHostDbMapper.java| 3 - .../apache/nutch/hostdb/UpdateHostDbReducer.java | 2 - .../nutch/indexer/IndexingFiltersChecker.java | 2 +- .../org/apache/nutch/net/protocols/Response.java | 2 +- .../org/apache/nutch/parse/OutlinkExtractor.java | 2 +- src/java/org/apache/nutch/parse/ParseData.java | 18 + .../org/apache/nutch/parse/ParsePluginsReader.java | 2 +- .../org/apache/nutch/segment/SegmentMerger.java| 4 +- .../org/apache/nutch/service/impl/LinkReader.java | 8 +- .../org/apache/nutch/service/impl/NodeReader.java | 8 +- .../service/impl/NutchServerPoolExecutor.java | 2 +- .../apache/nutch/service/impl/SequenceReader.java | 8 +- .../org/apache/nutch/tools/arc/ArcInputFormat.java | 4 +- .../apache/nutch/tools/arc/ArcRecordReader.java| 2 +- .../apache/nutch/tools/arc/ArcSegmentCreator.java | 4 +- .../org/apache/nutch/util/EncodingDetector.java| 6 +- src/java/org/apache/nutch/util/MimeUtil.java | 3 +- src/plugin/indexer-cloudsearch/README.md | 54 ++--- src/plugin/indexer-csv/README.md | 42 ++ .../nutch/indexwriter/csv/CSVIndexWriter.java | 4 +- src/plugin/indexer-dummy/README.md | 34 src/plugin/indexer-elastic-rest/README.md | 45 +++ src/plugin/indexer-elastic/README.md | 41 ++ src/plugin/indexer-rabbit/README.md| 44 +++ src/plugin/indexer-solr/README.md | 40 ++ .../apache/nutch/parse/html/HTMLMetaProcessor.java | 45 +-- 
.../apache/nutch/parse/tika/HTMLMetaProcessor.java | 45 +-- .../apache/nutch/protocol/http/HttpResponse.java | 4 +- .../org/apache/nutch/protocol/okhttp/OkHttp.java | 22 +++--- .../nutch/scoring/orphan/OrphanScoringFilter.java | 12 ++- .../scoring/orphan/TestOrphanScoringFilter.java| 4 +- src/test/org/apache/nutch/crawl/TestGenerator.java | 6 +- 37 files changed, 463 insertions(+), 182 deletions(-)
[nutch] branch master updated: NUTCH-2694 HostDB to aggregate by long instead of integer
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 33922fe NUTCH-2694 HostDB to aggregate by long instead of integer 33922fe is described below commit 33922feb804d740180fb4abd833884dae6d62cc0 Author: Markus Jelsma AuthorDate: Fri Feb 22 14:08:08 2019 +0100 NUTCH-2694 HostDB to aggregate by long instead of integer --- CHANGES.txt| 9 +- src/java/org/apache/nutch/hostdb/HostDatum.java| 110 ++--- .../org/apache/nutch/hostdb/ResolverThread.java| 6 +- .../apache/nutch/hostdb/UpdateHostDbReducer.java | 34 +++ 4 files changed, 81 insertions(+), 78 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 96bd05a..12f5aad 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -6,9 +6,12 @@ Comments Breaking Changes -The value of crawl.gen.delay is now read in milliseconds as stated in the description -in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for -further information. +- The value of crawl.gen.delay is now read in milliseconds as stated in the description + in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for + further information. + +- HostDB entries have been moved from Integer to Long in order to accomodate very large + hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb. 
Nutch 1.15 Release (25/07/2018) diff --git a/src/java/org/apache/nutch/hostdb/HostDatum.java b/src/java/org/apache/nutch/hostdb/HostDatum.java index fe3b73e..2bc9244 100644 --- a/src/java/org/apache/nutch/hostdb/HostDatum.java +++ b/src/java/org/apache/nutch/hostdb/HostDatum.java @@ -30,7 +30,7 @@ import org.apache.hadoop.io.Writable; /** */ public class HostDatum implements Writable, Cloneable { - protected int failures = 0; + protected long failures = 0; protected float score = 0; protected Date lastCheck = new Date(0); protected String homepageUrl = new String(); @@ -38,17 +38,17 @@ public class HostDatum implements Writable, Cloneable { protected MapWritable metaData = new MapWritable(); // Records the number of times DNS look-up failed, may indicate host no longer exists - protected int dnsFailures = 0; + protected long dnsFailures = 0; // Records the number of connection failures, may indicate our netwerk being blocked by firewall - protected int connectionFailures = 0; + protected long connectionFailures = 0; - protected int unfetched = 0; - protected int fetched = 0; - protected int notModified = 0; - protected int redirTemp = 0; - protected int redirPerm = 0; - protected int gone = 0; + protected long unfetched = 0; + protected long fetched = 0; + protected long notModified = 0; + protected long redirTemp = 0; + protected long redirPerm = 0; + protected long gone = 0; public HostDatum() { } @@ -68,15 +68,15 @@ public class HostDatum implements Writable, Cloneable { } public void resetFailures() { -setDnsFailures(0); -setConnectionFailures(0); +setDnsFailures(0l); +setConnectionFailures(0l); } - public void setDnsFailures(Integer dnsFailures) { + public void setDnsFailures(Long dnsFailures) { this.dnsFailures = dnsFailures; } - public void setConnectionFailures(Integer connectionFailures) { + public void setConnectionFailures(Long connectionFailures) { this.connectionFailures = connectionFailures; } @@ -88,15 +88,15 @@ public class HostDatum implements 
Writable, Cloneable { this.connectionFailures++; } - public Integer numFailures() { + public Long numFailures() { return getDnsFailures() + getConnectionFailures(); } - public Integer getDnsFailures() { + public Long getDnsFailures() { return dnsFailures; } - public Integer getConnectionFailures() { + public Long getConnectionFailures() { return connectionFailures; } @@ -120,7 +120,7 @@ public class HostDatum implements Writable, Cloneable { return score; } - public Integer numRecords() { + public Long numRecords() { return unfetched + fetched + gone + redirPerm + redirTemp + notModified; } @@ -140,51 +140,51 @@ public class HostDatum implements Writable, Cloneable { this.homepageUrl = homepageUrl; } - public void setUnfetched(int val) { + public void setUnfetched(long val) { unfetched = val; } - public int getUnfetched() { + public long getUnfetched() { return unfetched; } - public void setFetched(int val) { + public void setFetched(long val) { fetched = val; } - public int getFetched() { + public long getFetched() { return fetched; } - public void setNotModified(int val) { + public void setNotModified(long val) { notModified = val
[nutch] branch master updated: NUTCH-2687 Regex for reading title from Content-Disposition is wrong
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9cc076f NUTCH-2687 Regex for reading title from Content-Disposition is wrong 9cc076f is described below commit 9cc076f33746c34acfdeef8b3007bb5b0dec736d Author: Markus Jelsma AuthorDate: Fri Jan 18 11:36:49 2019 +0100 NUTCH-2687 Regex for reading title from Content-Disposition is wrong --- .../src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index c16d233..8c4a2d6 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -273,7 +273,7 @@ public class MoreIndexingFilter implements IndexingFilter { static { try { // order here is important - patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]"); + patterns[0] = Pattern.compile("\\bfilename=['\"]([^\"]+)"); patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); } catch (PatternSyntaxException e) { // just ignore
[nutch] branch master updated: NUTCH-2647 Skip TLS certificate checks in protocol-http plugin
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 61d7e8c NUTCH-2647 Skip TLS certificate checks in protocol-http plugin 61d7e8c is described below commit 61d7e8ce440aa544ce23e98a6fc6f811c482c5a0 Author: Markus Jelsma AuthorDate: Fri Sep 28 11:25:31 2018 +0200 NUTCH-2647 Skip TLS certificate checks in protocol-http plugin --- .../nutch/protocol/http/DummyX509TrustManager.java | 93 ++ .../apache/nutch/protocol/http/HttpResponse.java | 14 ++-- 2 files changed, 102 insertions(+), 5 deletions(-) diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java new file mode 100644 index 000..879f703 --- /dev/null +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Based on EasyX509TrustManager from commons-httpclient. 
+ */ + +package org.apache.nutch.protocol.http; + +import java.lang.invoke.MethodHandles; +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; + +import javax.net.ssl.TrustManagerFactory; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DummyX509TrustManager implements X509TrustManager { + private X509TrustManager standardTrustManager = null; + + /** Logger object for this class. */ + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Constructor for DummyX509TrustManager. + */ + public DummyX509TrustManager(KeyStore keystore) + throws NoSuchAlgorithmException, KeyStoreException { +super(); +String algo = TrustManagerFactory.getDefaultAlgorithm(); +TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); +factory.init(keystore); +TrustManager[] trustmanagers = factory.getTrustManagers(); +if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(algo + " trust manager not supported"); +} +this.standardTrustManager = (X509TrustManager) trustmanagers[0]; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], + * String) + */ + public boolean isClientTrusted(X509Certificate[] certificates) { +return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], + * String) + */ + public boolean isServerTrusted(X509Certificate[] certificates) { +return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() + */ + public X509Certificate[] getAcceptedIssuers() { +return this.standardTrustManager.getAcceptedIssuers(); + } + + public void checkClientTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { +// do nothing + 
+ } + + public void checkServerTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { +// do nothing + + } +} diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 4b5544e..95ae352 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -30,8 +30,10 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; +import javax.net.ssl.TrustManager; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@
[nutch] branch master updated: NUTCH-2411 Index-metadata to support indexing multiple values for a field
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9a77f43 NUTCH-2411 Index-metadata to support indexing multiple values for a field 9a77f43 is described below commit 9a77f43774b2c3cd70785895afb989e9ee2d8d5f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Mar 8 14:03:12 2018 +0100 NUTCH-2411 Index-metadata to support indexing multiple values for a field --- conf/nutch-default.xml | 9 ++ .../nutch/indexer/metadata/MetadataIndexer.java| 35 ++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 87c4058..71ef51b 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1738,6 +1738,15 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + index.metadata.separator + + + Separator to use if you want to index multiple values for a given field. Leave empty to + treat each value as a single value. + + + index.geoip.usage diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index edb8b15..74d9eb1 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -17,9 +17,12 @@ package org.apache.nutch.indexer.metadata; +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; @@ -35,17 +38,21 @@ import org.apache.nutch.parse.Parse; * Indexer which can be configured to extract metadata from the crawldb, parse * metadata or content metadata. 
You can specify the properties "index.db.md", * "index.parse.md" or "index.content.md" who's values are comma-delimited - * Example : key1,key2,key3. + * key1,key2,key3. */ public class MetadataIndexer implements IndexingFilter { private Configuration conf; private String[] dbFieldnames; private Map<String, String> parseFieldnames; private String[] contentFieldnames; + private String separator; + private Set mvFields; private static final String db_CONF_PROPERTY = "index.db.md"; private static final String parse_CONF_PROPERTY = "index.parse.md"; private static final String content_CONF_PROPERTY = "index.content.md"; - + private static final String separator_CONF_PROPERTY = "index.metadata.separator"; + private static final String mvfields_CONF_PROPERTY = "index.metadata.multivalued.fields"; + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -58,7 +65,7 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : dbFieldnames) { Writable metadata = datum.getMetaData().get(new Text(metatag)); if (metadata != null) - doc.add(metatag, metadata.toString()); + add(doc, metatag, metadata.toString()); } } @@ -67,7 +74,7 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : parseFieldnames.keySet()) { for (String value : parse.getData().getParseMeta().getValues(metatag)) { if (value != null) -doc.add(parseFieldnames.get(metatag), value); +add(doc, parseFieldnames.get(metatag), value); } } } @@ -77,13 +84,27 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : contentFieldnames) { for (String value : parse.getData().getContentMeta().getValues(metatag)) { if (value != null) -doc.add(metatag, value); +add(doc, metatag, value); } } } return doc; } + + protected void add(NutchDocument doc, String key, String value) { +if (separator == null || value.indexOf(separator) == -1 || !mvFields.contains(key)) { + doc.add(key, 
value); +} else { + String[] parts = value.split(separator); + for (String part : parts) { +part = part.trim(); +if (part.length() != 0) { + doc.add(key, part); +} + } +} + } public void setConf(Configuration conf) { this.conf = conf; @@ -93,7 +114,9 @@ public class MetadataIndexer implements IndexingFilter { parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag); } contentFieldnames = conf.getStrings(content_CONF_P
[nutch] branch master updated: NUTCH-2458
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new c345618 NUTCH-2458 new 705686e Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch c345618 is described below commit c345618ec425f0e907a6e54565f2d0577139b45f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Fri Nov 10 10:56:56 2017 +0100 NUTCH-2458 --- .../parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 49dc378..73cd083 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -239,7 +239,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { // see if a Tika config file can be found in the job file URL customTikaConfig = conf.getResource(customConfFile); if (customTikaConfig != null) - tikaConfig = new TikaConfig(customTikaConfig); + tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader()); } catch (Exception e1) { String message = "Problem loading custom Tika configuration from " + customConfFile; -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 6199492 NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay 6199492 is described below commit 6199492f5e1e8811022257c88dbf63f1e1c739d0 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Nov 6 17:08:09 2017 +0100 NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay --- src/java/org/apache/nutch/crawl/Generator.java | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 21607ec..e5f4831 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -179,11 +179,16 @@ public class Generator extends NutchTool implements Tool { segCounts = new int[maxNumSegments]; if (job.get(GENERATOR_HOSTDB) != null) { +maxCountExpr = JexlUtil.parseExpression(job.get(GENERATOR_MAX_COUNT_EXPR, null)); +fetchDelayExpr = JexlUtil.parseExpression(job.get(GENERATOR_FETCH_DELAY_EXPR, null)); + } +} + +public void open() { + if (conf.get(GENERATOR_HOSTDB) != null) { try { - Path path = new Path(job.get(GENERATOR_HOSTDB), "current"); - hostdbReaders = SequenceFileOutputFormat.getReaders(job, path); - maxCountExpr = JexlUtil.parseExpression(job.get(GENERATOR_MAX_COUNT_EXPR, null)); - fetchDelayExpr = JexlUtil.parseExpression(job.get(GENERATOR_FETCH_DELAY_EXPR, null)); + Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); + hostdbReaders = SequenceFileOutputFormat.getReaders(conf, path); } catch (IOException e) { LOG.error("Error reading HostDB because {}", e.getMessage()); } @@ -287,14 +292,17 @@ public class Generator extends NutchTool implements Tool { Text key = new Text(); HostDatum value = new HostDatum(); + open(); for (int i = 0; i < 
hostdbReaders.length; i++) { while (hostdbReaders[i].next(key, value)) { if (host.equals(key.toString())) { +close(); return value; } } } + close(); return null; } -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2386 BasicURLNormalizer does not encode curly braces
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new bd8c847 NUTCH-2386 BasicURLNormalizer does not encode curly braces bd8c847 is described below commit bd8c8476b36a465159703c88b75eb08008650136 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Oct 25 15:00:33 2017 +0200 NUTCH-2386 BasicURLNormalizer does not encode curly braces --- .../apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 2 +- .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 8 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index ffd22ce..b6033ae 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -250,7 +250,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { // Traverse over all bytes in this URL for (byte b: path.getBytes(utf8)) { // Is this a control character? 
- if (b < 33 || b == 91 || b == 93) { + if (b < 0x21 || b == 0x5B || b == 0x5D || b == 0x7B || b == 0x7D) { // Start escape sequence sb.append('%'); diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 2625ea3..5cefbf3 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -171,6 +171,12 @@ public class TestBasicURLNormalizer { normalizeTest("http:;, "http:/"); normalizeTest("http:///;, "http:/"); } + + @Test + public void testCurlyBraces() throws Exception { +// check that leading and trailing spaces are removed +normalizeTest("http://foo.com/{{stuff}} ", "http://foo.com/%7B%7Bstuff%7D%7D;); + } private void normalizeTest(String weird, String normal) throws Exception { Assert.assertEquals("normalizing: " + weird, normal, @@ -181,4 +187,4 @@ public class TestBasicURLNormalizer { new TestBasicURLNormalizer().testNormalizer(); } -} \ No newline at end of file +} -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2445 Fetcher following outlinks to keep track of already fetched items
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 0cdd095 NUTCH-2445 Fetcher following outlinks to keep track of already fetched items 0cdd095 is described below commit 0cdd095c881eed52dc461e559ce6ae278e99157f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Oct 23 15:59:13 2017 +0200 NUTCH-2445 Fetcher following outlinks to keep track of already fetched items --- .../org/apache/nutch/fetcher/FetchItemQueue.java | 6 .../org/apache/nutch/fetcher/FetcherThread.java| 41 ++ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueue.java b/src/java/org/apache/nutch/fetcher/FetchItemQueue.java index b67be74..5096b37 100644 --- a/src/java/org/apache/nutch/fetcher/FetchItemQueue.java +++ b/src/java/org/apache/nutch/fetcher/FetchItemQueue.java @@ -22,6 +22,8 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.HashSet; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; @@ -51,6 +53,10 @@ public class FetchItemQueue { Text cookie; Text variableFetchDelayKey = new Text("_variableFetchDelay_"); boolean variableFetchDelaySet = false; + // keep track of duplicates if fetcher.follow.outlinks.depth > 0. Some urls may + // not get followed due to hash collisions. Hashing is used to reduce memory + // usage. 
+ Set alreadyFetched = new HashSet<>(); public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) { diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 77947b6..42d5d50 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -198,7 +198,7 @@ public class FetcherThread extends Thread { + " - forcing to byHost"); queueMode = FetchItemQueues.QUEUE_MODE_HOST; } -LOG.info("Using queue mode : " + queueMode); +LOG.info(getName() + " " + Thread.currentThread().getId() + " Using queue mode : " + queueMode); this.maxRedirect = conf.getInt("http.redirect.max", 3); maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100); @@ -219,7 +219,7 @@ public class FetcherThread extends Thread { if (storingContent) { robotsTxtContent = new LinkedList<>(); } else { -LOG.warn("Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!"); +LOG.warn(getName() + " " + Thread.currentThread().getId() + " Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!"); } } } @@ -262,7 +262,7 @@ public class FetcherThread extends Thread { continue; } else { // all done, finish this thread -LOG.info("Thread " + getName() + " has no more work available"); +LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available"); return; } } @@ -287,7 +287,7 @@ public class FetcherThread extends Thread { do { if (LOG.isInfoEnabled()) { - LOG.info("fetching " + fit.url + " (queue crawl delay=" + LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + fit.url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); } @@ -438,7 +438,7 @@ public class FetcherThread extends Thread { default: if (LOG.isWarnEnabled()) { -LOG.warn("Unknown ProtocolStatus: " + status.getCode()); 
+LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode()); } output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); @@ -447,7 +447,7 @@ public class FetcherThread extends Thread { if (redirecting && redirectCount > maxRedirect) { ((FetchItemQueues) fetchQueues).finishFetchItem(fit); if (LOG.isInfoEnabled()) { -LOG.info(" - redirect count exceeded " + fit.url); +LOG.info(getName() + " " + Thr
[nutch] branch master updated: NUTCH-2444 HostDB CSV dumper to emit field header by default
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d7e4046 NUTCH-2444 HostDB CSV dumper to emit field header by default new 3c21a6b Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch d7e4046 is described below commit d7e4046e6e725ed759d0c43e37c51c5c3122e006 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Oct 23 15:11:17 2017 +0200 NUTCH-2444 HostDB CSV dumper to emit field header by default --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 54649e4..28a7eb7 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -62,6 +62,7 @@ public class ReadHostDb extends Configured implements Tool { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + public static final String HOSTDB_DUMP_HEADER = "hostdb.dump.field.header"; public static final String HOSTDB_DUMP_HOSTNAMES = "hostdb.dump.hostnames"; public static final String HOSTDB_DUMP_HOMEPAGES = "hostdb.dump.homepages"; public static final String HOSTDB_FILTER_EXPRESSION = "hostdb.filter.expression"; @@ -69,12 +70,14 @@ public class ReadHostDb extends Configured implements Tool { static class ReadHostDbMapper extends Mapper<Text, HostDatum, Text, Text> { protected boolean dumpHostnames = false; protected boolean dumpHomepages = false; +protected boolean fieldHeader = true; protected Text emptyText = new Text(); protected Expression expr = null; public void setup(Context context) { dumpHomepages = context.getConfiguration().getBoolean(HOSTDB_DUMP_HOMEPAGES, false); dumpHostnames = 
context.getConfiguration().getBoolean(HOSTDB_DUMP_HOSTNAMES, false); + fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true); String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION); if (expr != null) { // Create or retrieve a JexlEngine @@ -89,7 +92,12 @@ public class ReadHostDb extends Configured implements Tool { } } -public void map(Text key, HostDatum datum, Context context) throws IOException, InterruptedException { +public void map(Text key, HostDatum datum, Context context) throws IOException, InterruptedException { + if (fieldHeader && !dumpHomepages && !dumpHostnames) { +context.write(new Text("hostname"), new Text("unfetched\tfetched\tgone\tredirTemp\tredirPerm\tredirSum\tok\tnumRecords\tdnsFail\tcnxFail\tsumFail\tscore\tlastCheck\thomepage\tmetadata")); +fieldHeader = false; + } + if (expr != null) { // Create a context and add data JexlContext jcontext = new MapContext(); -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2367 Get single record from HostDB
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new be3aea1 NUTCH-2367 Get single record from HostDB be3aea1 is described below commit be3aea1410835b34cfacdff7c3def9fb01a83e76 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Mar 16 11:40:02 2017 +0100 NUTCH-2367 Get single record from HostDB --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 39 ++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 5b08504..17e135a 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -30,9 +30,11 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; @@ -200,6 +202,29 @@ public class ReadHostDb extends Configured implements Tool { long end = System.currentTimeMillis(); LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } + + private void getHostDbRecord(Path hostDb, String host) throws Exception { +Configuration conf = getConf(); +SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, hostDb); + +Class keyClass = readers[0].getKeyClass(); +Class valueClass = readers[0].getValueClass(); + +if 
(!keyClass.getName().equals("org.apache.hadoop.io.Text")) + throw new IOException("Incompatible key (" + keyClass.getName() + ")"); + +Text key = (Text) keyClass.newInstance(); +HostDatum value = (HostDatum) valueClass.newInstance(); + +for (int i = 0; i < readers.length; i++) { + while (readers[i].next(key, value)) { +if (host.equals(key.toString())) { + System.out.println(value.toString()); +} + } + readers[i].close(); +} + } public static void main(String args[]) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args); @@ -208,13 +233,14 @@ public class ReadHostDb extends Configured implements Tool { public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: ReadHostDb [-dumpHomepages | -dumpHostnames | -expr ]"); + System.err.println("Usage: ReadHostDb [-get ] [ [-dumpHomepages | -dumpHostnames | -expr ]]"); return -1; } boolean dumpHomepages = false; boolean dumpHostnames = false; String expr = null; +String get = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("-dumpHomepages")) { @@ -225,6 +251,11 @@ public class ReadHostDb extends Configured implements Tool { LOG.info("ReadHostDb: dumping hostnames"); dumpHostnames = true; } + if (args[i].equals("-get")) { +get = args[i + 1]; +LOG.info("ReadHostDb: get: "+ get); +i++; + } if (args[i].equals("-expr")) { expr = args[i + 1]; LOG.info("ReadHostDb: evaluating expression: " + expr); @@ -233,7 +264,11 @@ public class ReadHostDb extends Configured implements Tool { } try { - readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + if (get != null) { +getHostDbRecord(new Path(args[0], "current"), get); + } else { +readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + } return 0; } catch (Exception e) { LOG.error("ReadHostDb: " + StringUtils.stringifyException(e)); -- To stop receiving notification emails like this one, please contact 
['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 3926910 NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\ 3926910 is described below commit 3926910e145df083ec9d42cd397c0cbd9b3a16da Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 13:04:25 2017 +0100 NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\ --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ab3ec0c..5b08504 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -173,7 +173,8 @@ public class ReadHostDb extends Configured implements Tool { conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); conf.set("mapred.textoutputformat.separator", "\t"); -Job job = new Job(conf, "ReadHostDb"); +Job job = Job.getInstance(conf); +job.setJobName("ReadHostDb"); job.setJarByClass(ReadHostDb.class); FileInputFormat.addInputPath(job, new Path(hostDb, "current")); @@ -239,4 +240,4 @@ public class ReadHostDb extends Configured implements Tool { return -1; } } -} \ No newline at end of file +} -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: remove test again
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 6d47e14 remove test again 6d47e14 is described below commit 6d47e14352540bdd0f0630e7a2aa0967f08122bc Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 12:59:21 2017 +0100 remove test again --- test | 0 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test b/test deleted file mode 100644 index e69de29..000 -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: test markus using git box
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7143a4c test markus using git box 7143a4c is described below commit 7143a4c68f52905537a6f22c8b2d46cb7610e238 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 12:58:15 2017 +0100 test markus using git box --- test | 0 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test b/test new file mode 100644 index 000..e69de29 -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
nutch git commit: NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are ill-formed
Repository: nutch Updated Branches: refs/heads/master 76aedcb78 -> 9a9c4b32b NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are ill-formed Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9a9c4b32 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9a9c4b32 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9a9c4b32 Branch: refs/heads/master Commit: 9a9c4b32b9c1ab9c47583a217665e4694272d58a Parents: 76aedcb Author: Markus Jelsma <mar...@apache.org> Authored: Tue Feb 14 14:15:32 2017 +0100 Committer: Markus Jelsma <mar...@apache.org> Committed: Tue Feb 14 14:15:32 2017 +0100 -- src/plugin/parsefilter-regex/README.txt | 41 .../parsefilter/regex/RegexParseFilter.java | 18 + 2 files changed, 52 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/README.txt -- diff --git a/src/plugin/parsefilter-regex/README.txt b/src/plugin/parsefilter-regex/README.txt new file mode 100644 index 000..1fac05f --- /dev/null +++ b/src/plugin/parsefilter-regex/README.txt @@ -0,0 +1,41 @@ +Parsefilter-regex plugin + +Allow parsing and set custom defined fields using regex. Rules can be defined +in a separate rule file or in the nutch configuration. + +If a rule file is used, should create a text file regex-parsefilter.txt (which +is the default name of the rules file). To use a different filename, either +update the file value in plugin's build.xml or add parsefilter.regex.file +config to the nutch config. 
+ +ie: + + parsefilter.regex.file + + /path/to/rulefile + +\t\t\n + +ie: + my_first_field htmlh1 + my_second_field textmy_pattern + + +If a rule file is not used, rules can be directly set in the nutch config: + +ie: + + parsefilter.regex.rules + + my_first_field htmlh1 + my_second_field textmy_pattern + +http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java -- diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index 6955166..f799e5f 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -179,13 +179,17 @@ public class RegexParseFilter implements HtmlParseFilter { while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { line = line.trim(); -String[] parts = line.split("\t"); - -String field = parts[0].trim(); -String source = parts[1].trim(); -String regex = parts[2].trim(); - -rules.put(field, new RegexRule(source, regex)); +String[] parts = line.split("\\s"); + +if (parts.length == 3) { +String field = parts[0].trim(); +String source = parts[1].trim(); +String regex = parts[2].trim(); + +rules.put(field, new RegexRule(source, regex)); +} else { +LOG.info("RegexParseFilter rule is invalid. " + line); +} } } }
nutch git commit: revert 2320
Repository: nutch Updated Branches: refs/heads/master 836b2e01d -> d4c924e56 revert 2320 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d4c924e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d4c924e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d4c924e5 Branch: refs/heads/master Commit: d4c924e56030d6b1fa3b115686e80c8cf516db61 Parents: 836b2e0 Author: Markus Jelsma <mar...@apache.org> Authored: Thu Oct 6 10:56:50 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Thu Oct 6 10:56:50 2016 +0200 -- .../org/apache/nutch/net/URLFilterChecker.java | 181 ++- 1 file changed, 59 insertions(+), 122 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/d4c924e5/src/java/org/apache/nutch/net/URLFilterChecker.java -- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 86b91e2..89a3d00 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -17,27 +17,16 @@ package org.apache.nutch.net; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.net.ServerSocket; -import java.net.Socket; -import java.net.InetSocketAddress; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; -import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; /** * Checks one given filter or all filters. 
@@ -47,118 +36,62 @@ import org.slf4j.LoggerFactory; public class URLFilterChecker { private Configuration conf; - private static String filterName = null; - protected static boolean keepClientCnxOpen = false; - protected static int tcpPort = -1; - protected URLFilters filters = null; - - public static final Logger LOG = LoggerFactory - .getLogger(URLFilterChecker.class); - + public URLFilterChecker(Configuration conf) { -System.out.println("Checking combination of all URLFilters available"); this.conf = conf; -if (filterName != null) { -this.conf.set("plugin.includes", filterName); -} -filters = new URLFilters(this.conf); } - - public void run() throws Exception { -// In listening mode? -if (tcpPort == -1) { - // No, just fetch and display - checkStdin(); -} else { - // Listen on socket and start workers on incoming requests - listen(); -} - } - - private void listen() throws Exception { -ServerSocket server = null; - -try{ - server = new ServerSocket(); - server.bind(new InetSocketAddress(tcpPort)); - LOG.info(server.toString()); -} catch (Exception e) { - LOG.error("Could not listen on port " + tcpPort); - System.exit(-1); -} - -while(true){ - Worker worker; - try{ -worker = new Worker(server.accept()); -Thread thread = new Thread(worker); -thread.start(); - } catch (Exception e) { -LOG.error("Accept failed: " + tcpPort); -System.exit(-1); - } -} - } - - private class Worker implements Runnable { -private Socket client; -Worker(Socket client) { - this.client = client; - LOG.info(client.toString()); -} + private void checkOne(String filterName) throws Exception { +URLFilter filter = null; + +ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( +URLFilter.X_POINT_ID); + +if (point == null) + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); -public void run() { - if (keepClientCnxOpen) { -while (true) { // keep connection open until closes - readWrite(); -} +Extension[] extensions = point.getExtensions(); + +for (int i = 0; i < 
extensions.length; i++) { + Extension extension = extensions[i]; + filter = (URLFilter) extension.getExtensionInstance(); + if (filter.getClass().getName().equals(filterName)) { +break; } else { -readWrite(); - -try { // close ourselves - client.close(); -} catch (Exception e){ - LOG.error(e.toString()); -} +filter = null;
nutch git commit: NUTCH-2320 URLFilterChecker to run as TCP Telnet service
Repository: nutch Updated Branches: refs/heads/master e53b34b23 -> 836b2e01d NUTCH-2320 URLFilterChecker to run as TCP Telnet service Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/836b2e01 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/836b2e01 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/836b2e01 Branch: refs/heads/master Commit: 836b2e01d1a4e0e9443601da755ea37de91b8c7d Parents: e53b34b Author: Markus Jelsma <mar...@apache.org> Authored: Wed Oct 5 14:53:05 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Wed Oct 5 14:53:05 2016 +0200 -- .../org/apache/nutch/net/URLFilterChecker.java | 181 +-- 1 file changed, 122 insertions(+), 59 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/836b2e01/src/java/org/apache/nutch/net/URLFilterChecker.java -- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 89a3d00..86b91e2 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -17,16 +17,27 @@ package org.apache.nutch.net; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.InetSocketAddress; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.BufferedReader; -import java.io.InputStreamReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Checks one given filter or all filters. 
@@ -36,62 +47,118 @@ import java.io.InputStreamReader; public class URLFilterChecker { private Configuration conf; - + private static String filterName = null; + protected static boolean keepClientCnxOpen = false; + protected static int tcpPort = -1; + protected URLFilters filters = null; + + public static final Logger LOG = LoggerFactory + .getLogger(URLFilterChecker.class); + public URLFilterChecker(Configuration conf) { +System.out.println("Checking combination of all URLFilters available"); this.conf = conf; +if (filterName != null) { +this.conf.set("plugin.includes", filterName); +} +filters = new URLFilters(this.conf); } + + public void run() throws Exception { +// In listening mode? +if (tcpPort == -1) { + // No, just fetch and display + checkStdin(); +} else { + // Listen on socket and start workers on incoming requests + listen(); +} + } + + private void listen() throws Exception { +ServerSocket server = null; + +try{ + server = new ServerSocket(); + server.bind(new InetSocketAddress(tcpPort)); + LOG.info(server.toString()); +} catch (Exception e) { + LOG.error("Could not listen on port " + tcpPort); + System.exit(-1); +} + +while(true){ + Worker worker; + try{ +worker = new Worker(server.accept()); +Thread thread = new Thread(worker); +thread.start(); + } catch (Exception e) { +LOG.error("Accept failed: " + tcpPort); +System.exit(-1); + } +} + } + + private class Worker implements Runnable { +private Socket client; - private void checkOne(String filterName) throws Exception { -URLFilter filter = null; - -ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( -URLFilter.X_POINT_ID); - -if (point == null) - throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); - -Extension[] extensions = point.getExtensions(); +Worker(Socket client) { + this.client = client; + LOG.info(client.toString()); +} -for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - filter = (URLFilter) extension.getExtensionInstance(); 
- if (filter.getClass().getName().equals(filterName)) { -break; +public void run() { + if (keepClientCnxOpen) { +while (true) { // keep connection open until closes + readWrite(); +} } else { -filter = null; +readWrite(); + +try { // close ourselves + client.close(); +} catch (Exception e){ + LOG.error
nutch git commit: NUTCH-2272 Index checker server to optionally keep client connection open
Repository: nutch Updated Branches: refs/heads/master 7956daee8 -> beb48a84b NUTCH-2272 Index checker server to optionally keep client connection open Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/beb48a84 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/beb48a84 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/beb48a84 Branch: refs/heads/master Commit: beb48a84b2be52f92af24956ae59286ad116913c Parents: 7956dae Author: Markus Jelsma <mar...@apache.org> Authored: Fri Jun 3 15:02:12 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Fri Jun 3 15:02:12 2016 +0200 -- CHANGES.txt | 1 + .../nutch/indexer/IndexingFiltersChecker.java | 35 ++-- 2 files changed, 25 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/beb48a84/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index ffcf5ae..877f23b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -37,6 +37,7 @@ Bug Improvement +[NUTCH-2272] - Index checker server to optionally keep client connection open [NUTCH-1233] - Rely on Tika for outlink extraction [NUTCH-1712] - Use MultipleInputs in Injector to make it a single mapreduce job [NUTCH-2172] - index-more: document format of contenttype-mapping.txt http://git-wip-us.apache.org/repos/asf/nutch/blob/beb48a84/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java -- diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index da4123f..2e1b9c2 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -69,6 +69,7 @@ public class IndexingFiltersChecker extends Configured implements Tool { protected URLNormalizers normalizers = null; protected boolean dumpText = false; protected boolean followRedirects = false; + protected boolean keepClientCnxOpen = false; // used to 
simulate the metadata propagated from injection protected HashMap<String, String> metadata = new HashMap<String, String>(); protected int tcpPort = -1; @@ -82,7 +83,7 @@ public class IndexingFiltersChecker extends Configured implements Tool { public int run(String[] args) throws Exception { String url = null; -String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] [-keepClientCnxOpen]"; if (args.length == 0) { System.err.println(usage); @@ -96,6 +97,8 @@ public class IndexingFiltersChecker extends Configured implements Tool { tcpPort = Integer.parseInt(args[++i]); } else if (args[i].equals("-followRedirects")) { followRedirects = true; + } else if (args[i].equals("-keepClientCnxOpen")) { +keepClientCnxOpen = true; } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (args[i].equals("-md")) { @@ -164,7 +167,23 @@ public class IndexingFiltersChecker extends Configured implements Tool { LOG.info(client.toString()); } -public void run(){ +public void run() { + if (keepClientCnxOpen) { +while (true) { // keep connection open until closes + readWrite(); +} + } else { +readWrite(); + +try { // close ourselves + client.close(); +} catch (Exception e){ + LOG.error(e.toString()); +} + } +} + +protected void readWrite() { String line; BufferedReader in = null; PrintWriter out = null; @@ -185,14 +204,6 @@ public class IndexingFiltersChecker extends Configured implements Tool { }catch (Exception e) { LOG.error("Read/Write failed: " + e); } - - try { -client.close(); - } catch (Exception e){ -LOG.error(e.toString()); - } - - return; } } @@ -331,6 +342,8 @@ public class IndexingFiltersChecker extends Configured implements Tool { } } } + +output.append("\n"); // For readability if keepClientCnxOpen if (getConf().getBoolean("doIndex", false) && doc != null) { IndexWriters writers = new 
IndexWriters(getConf()); @@ -355,4 +368,4 @@ public class IndexingFiltersChecker extends Configured implements Tool { new IndexingFiltersChecker(), args); System.exit(res); } -} +} \ No newline at end of file
svn commit: r1732332 - /nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
Author: markus Date: Thu Feb 25 16:44:18 2016 New Revision: 1732332 URL: http://svn.apache.org/viewvc?rev=1732332=rev Log: NUTCH-2231 Jexl support in generator job Modified: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Modified: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java?rev=1732332=1732331=1732332=diff == --- nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Thu Feb 25 16:44:18 2016 @@ -47,6 +47,8 @@ public class JexlUtil { * @return parsed Jexl expression or null in case of parse error */ public static Expression parseExpression(String expr) { +if (expr == null) return null; + try { // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z Matcher matcher = datePattern.matcher(expr);
svn commit: r1732177 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java src/java/org/apache/nutch/crawl/Generator.java sr
Author: markus Date: Wed Feb 24 15:51:21 2016 New Revision: 1732177 URL: http://svn.apache.org/viewvc?rev=1732177=rev Log: NUTCH-2231 Jexl support in generator job Added: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732177=1732176=1732177=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 15:51:21 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2231 Jexl support in generator job (markus) + * NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus) * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732177=1732176=1732177=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 15:51:21 2016 @@ -534,7 +534,7 @@ public class CrawlDatum implements Writa jcontext.set("interval", new Integer(getFetchInterval())); jcontext.set("score", getScore()); jcontext.set("signature", StringUtil.toHexString(getSignature())); - + // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); @@ -553,15 +553,11 @@ public class CrawlDatum implements Writa if (value instanceof Text) { Text tvalue = (Text)value; - Text tkey = (Text)entry.getKey(); - - try { -Float number = Float.parseFloat(tvalue.toString()); -jcontext.set(tkey.toString(), number); - } catch (Exception e) {} + Text tkey = 
(Text)entry.getKey(); + jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString()); } } - + try { if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { return true; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732177=1732176=1732177=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 15:51:21 2016 @@ -65,6 +65,7 @@ import org.apache.hadoop.mapred.lib.Iden import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; @@ -508,8 +509,10 @@ public class CrawlDbReader extends Confi job.set("regex", regex); if (retry != null) job.setInt("retry", retry); -if (expr != null) +if (expr != null) { job.set("expr", expr); + LOG.info("CrawlDb db: expr: " + expr); +} job.setMapperClass(CrawlDbDumpMapper.class); job.setOutputKeyClass(Text.class); @@ -523,7 +526,6 @@ public class CrawlDbReader extends Confi public static class CrawlDbDumpMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { -Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); Pattern pattern = null; Matcher matcher = null; String status = null; @@ -536,30 +538,9 @@ public class CrawlDbReader extends Confi } status = job.get("status", null); retry = job.getInt("retry", -1); - String exprStr = job.get("expr", null); if (job.get("expr", null) != null) { -try { - // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z - Matcher matcher = datePattern.matcher(exprStr); - if (matcher.find()) { -String date = matcher.group(); - -// Parse the thing and get epoch! 
-Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"-MM-dd'T'HH:mm:ss'Z'"}); -
svn commit: r1732160 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Wed Feb 24 14:12:42 2016 New Revision: 1732160 URL: http://svn.apache.org/viewvc?rev=1732160=rev Log: NUTCH-2232 DeduplicationJob should decode URL's before length is compared Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732160=1732159=1732160=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 14:12:42 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus) + * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) * NUTCH-2227 RegexParseFilter (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1732160=1732159=1732160=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Feb 24 14:12:42 2016 @@ -17,6 +17,8 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Iterator; @@ -193,8 +195,15 @@ public class DeduplicationJob extends Nu break; case "urlLength": // same time? 
keep the one which has the shortest URL - String urlExisting = existingDoc.getMetaData().get(urlKey).toString(); - String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); + String urlExisting; + String urlnewDoc; + try { +urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8"); +urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8"); + } catch (UnsupportedEncodingException e) { +LOG.error("Error decoding: " + urlKey); +throw new IOException("UnsupportedEncodingException for " + urlKey); + } if (urlExisting.length() < urlnewDoc.length()) { // mark new one as duplicate writeOutAsDuplicate(newDoc, output, reporter);
svn commit: r1732140 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Wed Feb 24 13:05:02 2016 New Revision: 1732140 URL: http://svn.apache.org/viewvc?rev=1732140=rev Log: NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732140=1732139=1732140=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 13:05:02 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) + * NUTCH-2227 RegexParseFilter (markus) * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732140=1732139=1732140=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 13:05:02 2016 @@ -521,30 +521,20 @@ public class CrawlDatum implements Writa } } - public boolean evaluate(String expr) { -return evaluate(expr, true, true); - } - - public boolean evaluate(String expr, boolean silent, boolean strict) { -if (expr != null) { - // Create or retrieve a JexlEngine - JexlEngine jexl = new JexlEngine(); - - jexl.setSilent(silent); - jexl.setStrict(strict); - - // Create an expression object and evaluate - return evaluate(jexl.createExpression(expr)); -} - -return false; - } - public boolean evaluate(Expression expr) { if (expr != null) { // Create a context and add data JexlContext jcontext = new MapContext(); - + + // https://issues.apache.org/jira/browse/NUTCH-2229 + jcontext.set("status", getStatusName(getStatus())); + jcontext.set("fetchTime", (long)(getFetchTime())); + 
jcontext.set("modifiedTime", (long)(getModifiedTime())); + jcontext.set("retries", getRetriesSinceFetch()); + jcontext.set("interval", new Integer(getFetchInterval())); + jcontext.set("score", getScore()); + jcontext.set("signature", StringUtil.toHexString(getSignature())); + // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); @@ -571,7 +561,7 @@ public class CrawlDatum implements Writa } catch (Exception e) {} } } - + try { if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { return true; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732140=1732139=1732140=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 13:05:02 2016 @@ -70,6 +70,7 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.lang.time.DateUtils; /** * Read utility for the CrawlDB. 
@@ -522,6 +523,7 @@ public class CrawlDbReader extends Confi public static class CrawlDbDumpMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { +Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); Pattern pattern = null; Matcher matcher = null; String status = null; @@ -534,12 +536,30 @@ public class CrawlDbReader extends Confi } status = job.get("status", null); retry = job.getInt("retry", -1); - + String exprStr = job.get("expr", null); + if (job.get("expr", null) != null) { -JexlEngine jexl = new JexlEngine(); -jexl.setSilent(true); -jexl.setStrict(true); -expr = jexl.createExpression(job.get("expr", null)); +try { + // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z + Matcher matcher = datePattern.matcher(exprStr); + if (matcher.find()) { +String date = matcher.group(); + +// Parse the thing and get epoch! +
svn commit: r1731849 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/parsefilter-regex/ src/plugin/parsefilter-regex/data/ src/plugin/parsefilter-regex/src/ src/plugin/parsefilter-regex/src/java/ s
Author: markus Date: Tue Feb 23 12:58:54 2016 New Revision: 1731849 URL: http://svn.apache.org/viewvc?rev=1731849=rev Log: NUTCH-2227 RegexParseFilter Added: nutch/trunk/conf/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ nutch/trunk/src/plugin/parsefilter-regex/build.xml nutch/trunk/src/plugin/parsefilter-regex/data/ nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ivy.xml nutch/trunk/src/plugin/parsefilter-regex/plugin.xml nutch/trunk/src/plugin/parsefilter-regex/src/ nutch/trunk/src/plugin/parsefilter-regex/src/java/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java nutch/trunk/src/plugin/parsefilter-regex/src/test/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731849=1731848=1731849=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 
23 12:58:54 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2227 RegexParseFilter (markus) + * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1731849=1731848=1731849=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Tue Feb 23 12:58:54 2016 @@ -200,6 +200,7 @@ + @@ -637,6 +638,7 @@ + @@ -1048,6 +1050,8 @@ + + Added: nutch/trunk/conf/regex-parsefilter.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-parsefilter.txt?rev=1731849=auto == --- nutch/trunk/conf/regex-parsefilter.txt (added) +++ nutch/trunk/conf/regex-parsefilter.txt Tue Feb 23 12:58:54 2016 @@ -0,0 +1,8 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. 
+# +# format: \t\t\n Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1731849=1731848=1731849=diff == --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Tue Feb 23 12:58:54 2016 @@ -143,6 +143,7 @@ plugins.parse=\ plugins.parsefilter=\ org.apache.nutch.parse.headings*:\ org.apache.nutch.parsefilter.naivebayes*:\ + org.apache.nutch.parsefilter.regex*:\ org.apache.nutch.parse.metatags* # Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1731849=1731848=1731849=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Tue Feb 23 12:58:54 2016 @@ -77,6 +77,7 @@ + @@ -114,6 +115,7 @@ + @@ -176,6 +178,7 @@ + Added: nutch/trunk/src/plugin/parsefilter-regex/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/build.xml?rev=1731849=auto
svn commit: r1731836 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherThread.java src/java/org/apache/nutch/parse/ParseOutputFormat.java
Author: markus Date: Tue Feb 23 10:38:31 2016 New Revision: 1731836 URL: http://svn.apache.org/viewvc?rev=1731836=rev Log: NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731836=1731835=1731836=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 10:38:31 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) + * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) * NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731836=1731835=1731836=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:38:31 2016 @@ -538,6 +538,16 @@ + db.ignore.internal.links + false + If true, outlinks leading from a page to internal hosts or domain + will be ignored. This is an effective way to limit the crawl to include + only initially injected hosts, without creating complex URLFilters. + See 'db.ignore.external.links.mode'. 
+ + + + db.ignore.external.links false If true, outlinks leading from a page to external hosts or domain Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1731836=1731835=1731836=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Tue Feb 23 10:38:31 2016 @@ -84,6 +84,7 @@ public class FetcherThread extends Threa private String reprUrl; private boolean redirecting; private int redirectCount; + private boolean ignoreInternalLinks; private boolean ignoreExternalLinks; private String ignoreExternalLinksMode; @@ -174,6 +175,7 @@ public class FetcherThread extends Threa maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage; interval = conf.getInt("db.fetch.interval.default", 2592000); +ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false); ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false); ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost"); maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1); @@ -428,10 +430,10 @@ public class FetcherThread extends Threa newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); -if (ignoreExternalLinks) { - try { -String origHost = new URL(urlString).getHost().toLowerCase(); -String newHost = new URL(newUrl).getHost().toLowerCase(); +try { + String origHost = new URL(urlString).getHost().toLowerCase(); + String newHost = new URL(newUrl).getHost().toLowerCase(); + if (ignoreExternalLinks) { if (!origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug(" - ignoring redirect " + redirType + " from " @@ -440,10 +442,20 @@ public class FetcherThread extends Threa } return null; } - } catch (MalformedURLException e) { } -} - + + if 
(ignoreInternalLinks) { +if (origHost.equals(newHost)) { + if (LOG.isDebugEnabled()) { +LOG.debug(" - ignoring redirect " + redirType + " from " ++ urlString + " to " + newUrl ++ " because internal links are ignored"); + } + return null; +} + } +} catch (MalformedURLException e) { } + if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl); @@ -621,7 +633,7 @@ public class FetcherThread extends Threa // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); - if (ignoreExternalLinks) { + if (ignoreExternalLinks || ignoreInternalLinks) { URL originU
svn commit: r1731831 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/LinkDb.java src/java/org/apache/nutch/crawl/LinkDbMerger.java
Author: markus Date: Tue Feb 23 10:23:24 2016 New Revision: 1731831 URL: http://svn.apache.org/viewvc?rev=1731831=rev Log: NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731831=1731830=1731831=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 10:23:24 2016 @@ -1,5 +1,17 @@ +Fellow committers, Nutch 1.12 contains a breaking change NUTCH-2220. Please use the note below and +in the release announcement and keep it on top in this CHANGES.txt for the Nutch 1.12 release. + +* replace your old conf/nutch-default.xml with the conf/nutch-default.xml from Nutch 1.12 release +* if you use LinkDB (e.g. invertlinks) and modified parameters db.max.inlinks and/or db.max.anchor.length + and/or db.ignore.internal.links, rename those parameters to linkdb.max.inlinks and + linkdb.max.anchor.length and linkdb.ignore.internal.links +* db.ignore.internal.links and db.ignore.external.links now operate on the CrawlDB only +* linkdb.ignore.internal.links and linkdb.ignore.external.links now operate on the LinkDB only + Nutch Change Log +* NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) + * NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) * NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731831=1731830=1731831=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:23:24 2016 @@ -538,16 +538,6 @@ - db.ignore.internal.links - true - If true, when adding new links to a page, links from 
- the same host are ignored. This is an effective way to limit the - size of the link database, keeping only the highest quality - links. - - - - db.ignore.external.links false If true, outlinks leading from a page to external hosts or domain @@ -616,15 +606,6 @@ - db.max.inlinks - 1 - Maximum number of Inlinks per URL to be kept in LinkDb. - If "invertlinks" finds more inlinks than this number, only the first - N inlinks will be stored, and the rest will be discarded. - - - - db.max.outlinks.per.page 100 The maximum number of outlinks that we'll process for a page. @@ -681,6 +662,35 @@ + + + + linkdb.max.inlinks + 1 + Maximum number of Inlinks per URL to be kept in LinkDb. + If "invertlinks" finds more inlinks than this number, only the first + N inlinks will be stored, and the rest will be discarded. + + + + + linkdb.ignore.internal.links + true + If true, when adding new links to a page, links from + the same host are ignored. This is an effective way to limit the + size of the link database, keeping only the highest quality + links. + + + + + linkdb.ignore.external.links + false + If true, when adding new links to a page, links from + the a different host are ignored. 
+ + + Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1731831=1731830=1731831=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Tue Feb 23 10:23:24 2016 @@ -48,8 +48,8 @@ public class LinkDb extends NutchTool im public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); - public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links"; - public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links"; + public static final String IGNORE_INTERNAL_LINKS = "linkdb.ignore.internal.links"; + public static final String IGNORE_EXTERNAL_LINKS = "linkdb.ignore.external.links"; public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; @@ -68,7 +68,7 @@ public class LinkDb extends NutchTool im } public void configure(JobConf job) { -maxAnchorLength = job.getInt("db.max.anchor.length", 100); +maxAnchorLength = job.getInt("linkdb.max.anchor.length", 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); Modified: nu
svn commit: r1731824 - in /nutch/trunk: CHANGES.txt src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
Author: markus Date: Tue Feb 23 09:50:05 2016 New Revision: 1731824 URL: http://svn.apache.org/viewvc?rev=1731824=rev Log: NUTCH-2228 Plugin index-replace unit test broken on Java 8 Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731824=1731823=1731824=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 09:50:05 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) + * NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce) Modified: nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java?rev=1731824=1731823=1731824=diff == --- nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java (original) +++ nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java Tue Feb 23 09:50:05 2016 @@ -182,7 +182,7 @@ public class TestIndexReplace { String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; String expectedAuthor = "Peter Ciuffetti"; // Contains: invalid pattern, invalid flags, incomplete property -String indexReplaceProperty = " metatag.description=/this\\hplugin/this awesome plugin/\n" +String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; Configuration conf = NutchConfiguration.create();
svn commit: r1731651 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Mon Feb 22 14:41:37 2016 New Revision: 1731651 URL: http://svn.apache.org/viewvc?rev=1731651=rev Log: NUTCH-2219 Criteria order to be configurable in DeduplicationJob Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731651=1731650=1731651=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Feb 22 14:41:37 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) + * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce) * NUTCH-2223 Upgrade xercesImpl to 2.11.0 to fix hang on issue in tika mimetype detection (Tien Nguyen Manh via markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1731651=1731650=1731651=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Feb 22 14:41:37 2016 @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Random; +import java.util.Arrays; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -69,6 +70,7 @@ public class DeduplicationJob extends Nu private final static Text urlKey = new Text("_URLTEMPKEY_"); private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; + private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order"; public static class DBFilter implements Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> { @@ -128,6 +130,13 @@ public class DeduplicationJob extends Nu public static class DedupReducer implements Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> { +private String[] 
compareOrder; + +@Override +public void configure(JobConf arg0) { + compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(","); +} + private void writeOutAsDuplicate(CrawlDatum datum, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { @@ -144,6 +153,7 @@ public class DeduplicationJob extends Nu throws IOException { CrawlDatum existingDoc = null; + outerloop: while (values.hasNext()) { if (existingDoc == null) { existingDoc = new CrawlDatum(); @@ -151,48 +161,56 @@ public class DeduplicationJob extends Nu continue; } CrawlDatum newDoc = values.next(); -// compare based on score -if (existingDoc.getScore() < newDoc.getScore()) { - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; -} else if (existingDoc.getScore() > newDoc.getScore()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} -// same score? delete the one which is oldest -if (existingDoc.getFetchTime() > newDoc.getFetchTime()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) { - // mark existing one as duplicate - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; -} -// same time? 
keep the one which has the shortest URL -String urlExisting = existingDoc.getMetaData().get(urlKey).toString(); -String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); -if (urlExisting.length() < urlnewDoc.length()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} else if (urlExisting.length() > urlnewDoc.length()) { - // mark existing one as duplicate - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; + +for (int i = 0; i < compareOrder.length; i++) { + switch (compareOrder[i]) { +case "score": + // compare based on score + if (existingDoc.getScore() < newDoc.getScore()) { +writeOutAsDuplicate(existingDoc, output, reporte
svn commit: r1730803 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: markus Date: Wed Feb 17 09:55:27 2016 New Revision: 1730803 URL: http://svn.apache.org/viewvc?rev=1730803=rev Log: NUTCH-2224 Average bytes/second calculated incorrectly in fetcher Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730803=1730802=1730803=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 17 09:55:27 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2224 Average bytes/second calculated incorrectly in fetcher (Tien Nguyen Manh via markus) + * NUTCH-2225 Parsed time calculated incorrectly (Tien Nguyen Manh via markus) * NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1730803=1730802=1730803=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 17 09:55:27 2016 @@ -138,7 +138,7 @@ public class Fetcher extends NutchTool i Long elapsed = new Long((System.currentTimeMillis() - start) / 1000); float avgPagesSec = (float) pages.get() / elapsed.floatValue(); -long avgBytesSec = (bytes.get() / 125l) / elapsed.longValue(); +long avgBytesSec = (bytes.get() / 128l) / elapsed.longValue(); status.append(activeThreads).append(" threads (").append(spinWaiting.get()) .append(" waiting), "); @@ -148,7 +148,7 @@ public class Fetcher extends NutchTool i status.append(String.format("%.2f", avgPagesSec)).append(" pages/s ("); status.append(pagesLastSec).append(" last sec), "); status.append(avgBytesSec).append(" kbits/s (") -.append((bytesLastSec / 125)).append(" last sec)"); +.append((bytesLastSec / 128)).append(" last sec)"); reporter.setStatus(status.toString()); }
svn commit: r1730802 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java
Author: markus Date: Wed Feb 17 09:51:14 2016 New Revision: 1730802 URL: http://svn.apache.org/viewvc?rev=1730802=rev Log: NUTCH-2225 Parsed time calculated incorrectly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730802=1730801=1730802=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 17 09:51:14 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2225 Parsed time calculated incorrectly (Tien Nguyen Manh via markus) + * NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus) * NUTCH-1233 Rely on Tika for outlink extraction (markus) Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1730802=1730801=1730802=diff == --- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Feb 17 09:51:14 2016 @@ -96,6 +96,7 @@ public class ParseSegment extends NutchT return; } +long start = System.currentTimeMillis(); ParseResult parseResult = null; try { if (parseUtil == null) @@ -112,8 +113,6 @@ public class ParseSegment extends NutchT Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); - long start = System.currentTimeMillis(); - reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
svn commit: r1730687 - in /nutch/trunk: ./ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
Author: markus Date: Tue Feb 16 13:39:18 2016 New Revision: 1730687 URL: http://svn.apache.org/viewvc?rev=1730687=rev Log: NUTCH-1233 Rely on Tika for outlink extraction Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730687=1730686=1730687=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 16 13:39:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1233 Rely on Tika for outlink extraction (markus) + * NUTCH-2210 Upgrade to Tika 1.12 (markus) * NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen) Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1730687=1730686=1730687=diff == --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Tue Feb 16 13:39:18 2016 @@ -355,7 +355,9 @@ class DOMBuilder implements ContentHandl */ public void endElement(String ns, String localName, String name) throws org.xml.sax.SAXException { -m_elemStack.pop(); +if (!m_elemStack.isEmpty()) { + m_elemStack.pop(); +} m_currentNode = m_elemStack.isEmpty() ? 
null : (Node) m_elemStack.peek(); } Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1730687=1730686=1730687=diff == --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Feb 16 13:39:18 2016 @@ -22,11 +22,14 @@ import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; import org.apache.nutch.util.URLUtil; +import org.apache.tika.sax.Link; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -57,6 +60,7 @@ public class DOMContentUtils { } private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private HashSet ignoredTags = new HashSet(); private Configuration conf; public DOMContentUtils(Configuration conf) { @@ -85,6 +89,7 @@ public class DOMContentUtils { // remove unwanted link tags from the linkParams map String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + ignoredTags.add(ignoreTags[i].toLowerCase()); if (!forceTags.contains(ignoreTags[i])) linkParams.remove(ignoreTags[i]); } @@ -244,7 +249,7 @@ public class DOMContentUtils { } return true; } - + // this only covers a few cases of empty links that are symptomatic // of nekohtml's DOM-fixup process... 
private boolean shouldThrowAwayLink(Node node, NodeList children, @@ -365,5 +370,33 @@ public class DOMContentUtils { } } } - -} + + // This one is used by NUTCH-1918 + public void getOutlinks(URL base, ArrayList outlinks, List tikaExtractedOutlinks) { +String target = null; +String anchor = null; +boolean noFollow = false; + +for (Link link : tikaExtractedOutlinks) { + target = link.getUri(); + noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false; + anchor = link.getText(); + + if (!ignoredTags.contains(link.getType())) { +if (target != null && !noFollow) { + try { +URL url = URLUtil.resolveURL(base, target); + +// clean the anchor +anchor = anchor.replaceAll("\\s+", " "); +anchor = anchor.trim(); + +outlinks.add(new Outlink(url.toString(), anchor)); + } catch (MalformedURLException e) { +// don't care + } +} + } +} + } +} \ No newline at
svn commit: r1728313 - in /nutch/trunk: ./ src/plugin/indexer-solr/ src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/
Author: markus Date: Wed Feb 3 13:51:10 2016 New Revision: 1728313 URL: http://svn.apache.org/viewvc?rev=1728313=rev Log: NUTCH-2197 Add Solr 5 cloud indexer support Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/indexer-solr/ivy.xml nutch/trunk/src/plugin/indexer-solr/plugin.xml nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1728313=1728312=1728313=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 3 13:51:10 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2197 Add Solr 5 cloud indexer support (Jurian Broertjes via markus) + * NUTCH-2206 Provide example scoring.similarity.stopword.file (sujen) * NUTCH-2204 Remove junit lib from runtime (snagel) Modified: nutch/trunk/src/plugin/indexer-solr/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/ivy.xml?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/ivy.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/ivy.xml Wed Feb 3 13:51:10 2016 @@ -36,9 +36,9 @@ - - - + + + Modified: nutch/trunk/src/plugin/indexer-solr/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/plugin.xml?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/plugin.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/plugin.xml Wed Feb 3 13:51:10 2016 @@ -22,17 +22,16 @@ - - - - - - - - - - - + + + + + + + + + + Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Wed Feb 3 13:51:10 2016 @@ -17,7 +17,6 @@ package org.apache.nutch.indexwriter.solr; public interface SolrConstants { - public static final String SOLR_PREFIX = "solr."; public static final String SERVER_URL = SOLR_PREFIX + "server.url"; @@ -31,13 +30,23 @@ public interface SolrConstants { public static final String USERNAME = SOLR_PREFIX + "auth.username"; public static final String PASSWORD = SOLR_PREFIX + "auth.password"; - - public static final String SERVER_TYPE = SOLR_PREFIX + "server.type"; - - public static final String ZOOKEEPER_URL = SOLR_PREFIX + "zookeeper.url"; - - public static final String LOADBALANCE_URLS = SOLR_PREFIX + "loadbalance.urls"; - + + public static final String COLLECTION = SOLR_PREFIX + "collection"; + + public static final String ZOOKEEPER_HOSTS = SOLR_PREFIX + "zookeeper.hosts"; + + public static final String ID_FIELD = "id"; + + public static final String URL_FIELD = "url"; + + public static final String BOOST_FIELD = "boost"; + + public static final String TIMESTAMP_FIELD = "tstamp"; + + public static final String DIGEST_FIELD = "digest"; + + + @Deprecated public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index"; Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original) +++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Wed Feb 3 13:51:10 2016 @@ -17,6 +17,7 @@ package org.apache.nutch.indexwriter.solr; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -28,2
svn commit: r1725981 - in /nutch/trunk: ./ src/java/org/apache/nutch/scoring/webgraph/
Author: markus Date: Thu Jan 21 15:18:07 2016 New Revision: 1725981 URL: http://svn.apache.org/viewvc?rev=1725981=rev Log: NUTCH-2201 Remove loops program from webgraph package Removed: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725981=1725980=1725981=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jan 21 15:18:07 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2201 Remove loops program from webgraph package (markus) + * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp) * NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus) Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1725981=1725980=1725981=diff == --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 21 15:18:07 2016 @@ -59,7 +59,6 @@ import org.apache.hadoop.mapred.lib.Hash import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -246,9 +245,8 @@ public class LinkDumper extends Configur String fromUrl = key.toString(); List outlinks = new ArrayList(); Node node = null; - LoopSet loops = null; - - // loop through all values aggregating outlinks, saving node and loopset + 
+ // loop through all values aggregating outlinks, saving node while (values.hasNext()) { ObjectWritable write = values.next(); Object obj = write.get(); @@ -256,25 +254,16 @@ public class LinkDumper extends Configur node = (Node) obj; } else if (obj instanceof LinkDatum) { outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); -} else if (obj instanceof LoopSet) { - loops = (LoopSet) obj; } } // only collect if there are outlinks int numOutlinks = node.getNumOutlinks(); if (numOutlinks > 0) { - -Set loopSet = (loops != null) ? loops.getLoopSet() : null; for (int i = 0; i < outlinks.size(); i++) { LinkDatum outlink = outlinks.get(i); String toUrl = outlink.getUrl(); - // remove any url that is in the loopset, same as LinkRank - if (loopSet != null && loopSet.contains(toUrl)) { -continue; - } - // collect the outlink as an inlink with the node output.collect(new Text(toUrl), new LinkNode(fromUrl, node)); } @@ -343,8 +332,6 @@ public class LinkDumper extends Configur Path linkdump = new Path(webGraphDb, DUMP_DIR); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); -Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR); -boolean loopsExists = fs.exists(loopSetDb); Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR); // run the inverter job @@ -353,9 +340,6 @@ public class LinkDumper extends Configur JobConf inverter = new NutchJob(conf); inverter.setJobName("LinkDumper: inverter"); FileInputFormat.addInputPath(inverter, nodeDb); -if (loopsExists) { - FileInputFormat.addInputPath(inverter, loopSetDb); -} FileInputFormat.addInputPath(inverter, outlinkDb); inverter.setInputFormat(SequenceFileInputFormat.class); inverter.setMapperClass(Inverter.class); Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1725981=1725980=1725981=diff == --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ 
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 21 15:18:07 2016 @@ -61,7 +61,6 @@ import org.apache.hadoop.mapred.TextOutp import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nut
svn commit: r1725538 - in /nutch/trunk: CHANGES.txt src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
Author: markus Date: Tue Jan 19 14:53:05 2016 New Revision: 1725538 URL: http://svn.apache.org/viewvc?rev=1725538=rev Log: NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725538=1725537=1725538=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jan 19 14:53:05 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus) + * NUTCH-2194 Run IndexingFilterChecker as simple Telnet server (markus) * NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1725538=1725537=1725538=diff == --- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Tue Jan 19 14:53:05 2016 @@ -196,6 +196,7 @@ public class SuffixURLFilter implements String line; while ((line = in.readLine()) != null) { + line = line.trim(); if (line.length() == 0) continue;
svn commit: r1724771 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Fri Jan 15 10:45:27 2016 New Revision: 1724771 URL: http://svn.apache.org/viewvc?rev=1724771=rev Log: NUTCH-2194 Run IndexingFilterChecker as simple Telnet server Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724771=1724770=1724771=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 15 10:45:27 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2194 Run IndexingFilterChecker as simple Telnet server (markus) + * NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724771=1724770=1724771=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Fri Jan 15 10:45:27 2016 @@ -17,6 +17,13 @@ package org.apache.nutch.indexer; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.InetSocketAddress; +import java.nio.charset.Charset; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -59,6 +66,13 @@ import org.slf4j.LoggerFactory; public class IndexingFiltersChecker extends Configured implements Tool { + protected URLNormalizers normalizers = null; + protected boolean dumpText = false; + protected boolean followRedirects = false; + // used to simulate the metadata propagated from injection + protected HashMap<String, String> metadata = new HashMap<String, String>(); + protected int tcpPort = -1; + public static final Logger LOG = 
LoggerFactory .getLogger(IndexingFiltersChecker.class); @@ -67,25 +81,19 @@ public class IndexingFiltersChecker exte } public int run(String[] args) throws Exception { -String contentType = null; String url = null; -URLNormalizers normalizers = null; -boolean dumpText = false; -boolean followRedirects = false; - -String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] "; if (args.length == 0) { System.err.println(usage); return -1; } -// used to simulate the metadata propagated from injection -HashMap<String, String> metadata = new HashMap<String, String>(); - for (int i = 0; i < args.length; i++) { if (args[i].equals("-normalize")) { normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); + } else if (args[i].equals("-listen")) { +tcpPort = Integer.parseInt(args[++i]); } else if (args[i].equals("-followRedirects")) { followRedirects = true; } else if (args[i].equals("-dumpText")) { @@ -108,6 +116,88 @@ public class IndexingFiltersChecker exte } } +// In listening mode? 
+if (tcpPort == -1) { + // No, just fetch and display + StringBuilder output = new StringBuilder(); + int ret = fetch(url, output); + System.out.println(output); + return ret; +} else { + // Listen on socket and start workers on incoming requests + listen(); +} + +return 0; + } + + protected void listen() throws Exception { +ServerSocket server = null; + +try{ + server = new ServerSocket(); + server.bind(new InetSocketAddress(tcpPort)); + LOG.info(server.toString()); +} catch (Exception e) { + LOG.error("Could not listen on port " + tcpPort); + System.exit(-1); +} + +while(true){ + Worker worker; + try{ +worker = new Worker(server.accept()); +Thread thread = new Thread(worker); +thread.start(); + } catch (Exception e) { +LOG.error("Accept failed: " + tcpPort); +System.exit(-1); + } +} + } + + private class Worker implements Runnable { +private Socket client; + +Worker(Socket client) { + this.client = client; + LOG.info(client.toString()); +} + +public void run(){ + String line; + BufferedReader in = null; + PrintWriter out = null; + + try{ +in = new BufferedReader(new InputStreamReader(client.getInpu
svn commit: r1724418 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Wed Jan 13 13:10:19 2016 New Revision: 1724418 URL: http://svn.apache.org/viewvc?rev=1724418=rev Log: NUTCH-2196 IndexingFilterChecker to optionally normalize Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724418=1724417=1724418=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 13 13:10:19 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) + * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) * NUTCH-2190 Protocol normalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724418=1724417=1724418=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 13:10:19 2016 @@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseSegment; @@ -43,7 +44,6 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,10 +69,11 @@ public class IndexingFiltersChecker exte public int run(String[] args) throws Exception { String contentType = null; String url = null; +URLNormalizers normalizers = null; 
boolean dumpText = false; boolean followRedirects = false; -String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] "; if (args.length == 0) { System.err.println(usage); @@ -83,7 +84,9 @@ public class IndexingFiltersChecker exte HashMap<String, String> metadata = new HashMap<String, String>(); for (int i = 0; i < args.length; i++) { - if (args[i].equals("-followRedirects")) { + if (args[i].equals("-normalize")) { +normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); + } else if (args[i].equals("-followRedirects")) { followRedirects = true; } else if (args[i].equals("-dumpText")) { dumpText = true; @@ -101,9 +104,13 @@ public class IndexingFiltersChecker exte System.err.println(usage); System.exit(-1); } else { -url = URLUtil.toASCII(args[i]); +url =args[i]; } } + +if (normalizers != null) { + url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); +} LOG.info("fetching: " + url); @@ -129,6 +136,11 @@ public class IndexingFiltersChecker exte while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) { String[] stuff = output.getStatus().getArgs(); url = stuff[0]; + + if (normalizers != null) { +url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); + } + turl.set(url); // try again
svn commit: r1724409 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Wed Jan 13 12:17:03 2016 New Revision: 1724409 URL: http://svn.apache.org/viewvc?rev=1724409=rev Log: NUTCH-2195 IndexingFilterChecker to optionally follow N redirects Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724409=1724408=1724409=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 13 12:17:03 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) + * NUTCH-2190 Protocol normalizer (markus) * NUTCH-1838 Host and domain based regex and automaton filtering (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724409=1724408=1724409=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 12:17:03 2016 @@ -70,8 +70,9 @@ public class IndexingFiltersChecker exte String contentType = null; String url = null; boolean dumpText = false; +boolean followRedirects = false; -String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] "; if (args.length == 0) { System.err.println(usage); @@ -82,7 +83,9 @@ public class IndexingFiltersChecker exte HashMap<String, String> metadata = new HashMap<String, String>(); for (int i = 0; i < args.length; i++) { - if (args[i].equals("-dumpText")) { + if (args[i].equals("-followRedirects")) { +followRedirects = true; + } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (args[i].equals("-md")) { String k = null, v = null; @@ -116,11 +119,22 @@ public class 
IndexingFiltersChecker exte } IndexingFilters indexers = new IndexingFilters(getConf()); + +int maxRedirects = 3; -ProtocolFactory factory = new ProtocolFactory(getConf()); -Protocol protocol = factory.getProtocol(url); +ProtocolOutput output = getProtocolOutput(url, datum); Text turl = new Text(url); -ProtocolOutput output = protocol.getProtocolOutput(turl, datum); + +// Following redirects and not reached maxRedirects? +while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) { + String[] stuff = output.getStatus().getArgs(); + url = stuff[0]; + turl.set(url); + + // try again + output = getProtocolOutput(url, datum); + maxRedirects--; +} if (!output.getStatus().isSuccess()) { System.out.println("Fetch failed with protocol status: " @@ -224,6 +238,14 @@ public class IndexingFiltersChecker exte return 0; } + + protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception { +ProtocolFactory factory = new ProtocolFactory(getConf()); +Protocol protocol = factory.getProtocol(url); +Text turl = new Text(url); +ProtocolOutput output = protocol.getProtocolOutput(turl, datum); +return output; + } public static void main(String[] args) throws Exception { final int res = ToolRunner.run(NutchConfiguration.create(),
svn commit: r1724199 - /nutch/trunk/conf/protocols.txt
Author: markus Date: Tue Jan 12 10:33:59 2016 New Revision: 1724199 URL: http://svn.apache.org/viewvc?rev=1724199&view=rev Log: NUTCH-2190 Protocol normalizer Added: nutch/trunk/conf/protocols.txt Added: nutch/trunk/conf/protocols.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/protocols.txt?rev=1724199&view=auto == --- nutch/trunk/conf/protocols.txt (added) +++ nutch/trunk/conf/protocols.txt Tue Jan 12 10:33:59 2016 @@ -0,0 +1,7 @@ +# Example configuration file for urlnormalizer-protocol +# +# URLs of hosts listed in the configuration are normalized to the target +# protocol. Useful in cases where a host accepts both http and https, doubling +# the site's size. +# +# format: <host>\t<protocol>\n
svn commit: r1724085 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-protocol/ src/plugin/urlnormalizer-protocol/data/ src/plugin/urlnormalizer-protocol/src/ src/plugin/urlnormalizer-protoc
Author: markus Date: Mon Jan 11 17:10:30 2016 New Revision: 1724085 URL: http://svn.apache.org/viewvc?rev=1724085=rev Log: NUTCH-2190 Protocol normalizer Added: nutch/trunk/src/plugin/urlnormalizer-protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml nutch/trunk/src/plugin/urlnormalizer-protocol/data/ nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml nutch/trunk/src/plugin/urlnormalizer-protocol/src/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: 
nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724085=1724084=1724085=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 11 17:10:30 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2190 Protocol normalizer (markus) + * NUTCH-1838 Host and domain based regex and automaton filtering (markus) * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1724085=1724084=1724085=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Mon Jan 11 17:10:30 2016 @@ -224,6 +224,7 @@ + @@ -660,6 +661,7 @@ + @@ -1082,6 +1084,8 @@ + + Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1724085=1724084=1724085=diff == --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Mon Jan 11 17:10:30 2016 @@ -110,6 +110,7 @@ plugins.urlnormalizer=\ org.apache.nutch.net.urlnormalizer.basic*:\ org.apache.nutch.net.urlnormalizer.host*:\ org.apache.nutch.net.urlnormalizer.pass*:\ + org.apache.nutch.net.urlnormalizer.protocol*:\ org.apache.nutch.net.urlnormalizer.querystring*:\ org.apache.nutch.net.urlnormalizer.regex* Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1724085=1724084=1724085=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Mon Jan 11 17:10:30 2016 @@ -82,6 +82,7 @@ + @@ -125,6 +126,7 @@ + @@ -193,6 +195,7 @@ + Added: nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml?rev=1724085=auto == --- nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml Mon Jan 11 17:10:30 2016 @@ -0,0 +1,27 @@ + + + + + + + + + + + + Added: 
nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt?rev
svn commit: r1723688 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Fri Jan 8 11:10:38 2016 New Revision: 1723688 URL: http://svn.apache.org/viewvc?rev=1723688=rev Log: NUTCH-1449 Optionally delete documents skipped by IndexingFilters Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723688=1723687=1723688=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 11:10:38 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) + * NUTCH-2189 Domain filter must deactivate if no rules are present (markus) * NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1723688=1723687=1723688=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Jan 8 11:10:38 2016 @@ -1043,6 +1043,20 @@ + + indexer.delete.robots.noindex + false + Whether the indexer will delete documents marked by robots=noindex + + + + + indexer.delete.skipped.by.indexingfilter + false + Whether the indexer will delete documents that were skipped by indexing filters + + + Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1723688=1723687=1723688=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jan 8 11:10:38 2016 @@ -63,6 +63,7 @@ public class IndexerMapReduce extends Co public static final String INDEXER_PARAMS = "indexer.additional.params"; public static final String INDEXER_DELETE = "indexer.delete"; public static final String 
INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex"; + public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter"; public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified"; public static final String URL_FILTERING = "indexer.url.filters"; public static final String URL_NORMALIZING = "indexer.url.normalizers"; @@ -71,6 +72,7 @@ public class IndexerMapReduce extends Co private boolean skip = false; private boolean delete = false; private boolean deleteRobotsNoIndex = false; + private boolean deleteSkippedByIndexingFilter = false; private boolean base64 = false; private IndexingFilters filters; private ScoringFilters scfilters; @@ -94,6 +96,8 @@ public class IndexerMapReduce extends Co this.delete = job.getBoolean(INDEXER_DELETE, false); this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false); +this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED, +false); this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false); this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false); @@ -245,7 +249,7 @@ public class IndexerMapReduce extends Co || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { -reporter.incrCounter("IndexerStatus", "deleted redirects", 1); +reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1); output.collect(key, DELETE_ACTION); return; } @@ -258,7 +262,7 @@ public class IndexerMapReduce extends Co // Whether to delete pages marked as duplicates if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { - reporter.incrCounter("IndexerStatus", "deleted duplicates", 1); + reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1); output.collect(key, DELETE_ACTION); return; } @@ -284,8 +288,25 @@ public class IndexerMapReduce extends Co // add digest, used by dedup doc.add("digest", 
metadata.get(Nutch.SIGNATURE_KEY)); - + final Parse parse = new ParseImpl(parseText, parseData); +float boost = 1.0f; +// run scoring filters +try { + boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, + inlinks, boost); +} catch (final ScoringFilterException e) { + reporter.incrCoun
svn commit: r1723690 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Fri Jan 8 11:14:33 2016 New Revision: 1723690 URL: http://svn.apache.org/viewvc?rev=1723690=rev Log: NUTCH-2178 DeduplicationJob to optionally group on host or domain Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723690=1723689=1723690=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 11:14:33 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) + * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) * NUTCH-2189 Domain filter must deactivate if no rules are present (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1723690=1723689=1723690=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Fri Jan 8 11:14:33 2016 @@ -49,6 +49,7 @@ import org.apache.nutch.util.NutchConfig import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.TimingUtil; +import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,12 +68,16 @@ public class DeduplicationJob extends Nu .getLogger(DeduplicationJob.class); private final static Text urlKey = new Text("_URLTEMPKEY_"); + private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; public static class DBFilter implements Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> { + +private String groupMode; @Override public void configure(JobConf arg0) { + groupMode = arg0.get(DEDUPLICATION_GROUP_MODE); } @Override @@ -90,10 +95,31 @@ public class DeduplicationJob extends Nu byte[] signature = 
value.getSignature(); if (signature == null) return; -BytesWritable sig = new BytesWritable(signature); +String url = key.toString(); +BytesWritable sig = null; +byte[] data; +switch (groupMode) { + case "none": +sig = new BytesWritable(signature); +break; + case "host": +byte[] host = URLUtil.getHost(url).getBytes(); +data = new byte[signature.length + host.length]; +System.arraycopy(signature, 0, data, 0, signature.length); +System.arraycopy(host, 0, data, signature.length, host.length); +sig = new BytesWritable(data); +break; + case "domain": +byte[] domain = URLUtil.getDomainName(url).getBytes(); +data = new byte[signature.length + domain.length]; +System.arraycopy(signature, 0, data, 0, signature.length); +System.arraycopy(domain, 0, data, signature.length, domain.length); +sig = new BytesWritable(data); +break; +} // add the URL as a temporary MD value.getMetaData().put(urlKey, key); -// reduce on the signature +// reduce on the signature optionall grouped on host or domain or not at all output.collect(sig, value); } } @@ -216,11 +242,17 @@ public class DeduplicationJob extends Nu public int run(String[] args) throws IOException { if (args.length < 1) { - System.err.println("Usage: DeduplicationJob "); + System.err.println("Usage: DeduplicationJob [-group <none|host|domain>]"); return 1; } +String group = "none"; String crawldb = args[0]; + +for (int i = 1; i < args.length; i++) { + if (args[i].equals("-group")) +group = args[++i]; +} SimpleDateFormat sdf = new SimpleDateFormat("-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -233,6 +265,7 @@ public class DeduplicationJob extends Nu JobConf job = new NutchJob(getConf()); job.setJobName("Deduplication on " + crawldb); +job.set(DEDUPLICATION_GROUP_MODE, group); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class);
svn commit: r1723710 - in /nutch/trunk: ./ src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/ src/plugin
Author: markus Date: Fri Jan 8 12:11:18 2016 New Revision: 1723710 URL: http://svn.apache.org/viewvc?rev=1723710=rev Log: NUTCH-1838 Host and domain based regex and automaton filtering Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723710=1723709=1723710=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 12:11:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1838 Host and domain based regex and automaton filtering (markus) + * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1723710=1723709=1723710=diff == --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,10 @@ package org.apache.nutch.urlfilter.api; public abstract class RegexRule { private final boolean sign; + + private final String 
hostOrDomain; + + private final String regex; /** * Constructs a new regular expression rule. @@ -38,7 +42,27 @@ public abstract class RegexRule { * {@link #match(String)} method). */ protected RegexRule(boolean sign, String regex) { +this(sign, regex, null); + } + + /** + * Constructs a new regular expression rule. + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * true value means that any url matching this rule must + * be accepted, a false value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). + * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected RegexRule(boolean sign, String regex, String hostOrDomain) { this.sign = sign; +this.hostOrDomain = hostOrDomain; +this.regex = regex; } /** @@ -52,6 +76,20 @@ public abstract class RegexRule { } /** + * Return if this rule is used for filtering-in or out. + * + * @return host or domain this regex rule belongs to + */ + protected String hostOrDomain() { return hostOrDomain; } + + /** + * Return if this rule's regex. + * + * @return this regex + */ + protected String regex() { return regex; } + + /** * Checks if a url matches this rule. 
* * @param url Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1723710=1723709=1723710=diff == --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.io.StringReader; +import java.net.MalformedURLException; import java.util.List; import java.util.ArrayList; @@ -36,6 +37,7 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.net.*; +import org.apache.nutch.util.URLUtil; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular @@ -123,6 +125,20 @@ public abstract class RegexURLFilterBase * is the regular expression associated to this rule. */ protected abstract RegexRule createRule
svn commit: r1721615 - in /nutch/trunk: CHANGES.txt src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java src/plugin/urlfilter-domain/src/test/org/apache/nutch/ur
Author: markus Date: Thu Dec 24 12:45:27 2015 New Revision: 1721615 URL: http://svn.apache.org/viewvc?rev=1721615=rev Log: NUTCH-2189 Domain filter must deactivate if no rules are present Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1721615=1721614=1721615=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Dec 24 12:45:27 2015 @@ -1,6 +1,8 @@ Nutch Change Log -* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency +* NUTCH-2189 Domain filter must deactivate if no rules are present (markus) + +* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce) * NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc) Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1721615=1721614=1721615=diff == --- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Dec 24 12:45:27 2015 @@ -180,9 +180,10 @@ public class DomainURLFilter implements } public String filter(String url) { - +// https://issues.apache.org/jira/browse/NUTCH-2189 +if (domainSet.size() == 0) return url; + try { - // match for suffix, domain, and host in that order. 
more general will // override more specific String domain = URLUtil.getDomainName(url).toLowerCase().trim(); Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1721615=1721614=1721615=diff == --- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Dec 24 12:45:27 2015 @@ -44,5 +44,24 @@ public class TestDomainURLFilter { Assert.assertNotNull(domainFilter.filter("http://www.foobar.be;)); Assert.assertNull(domainFilter.filter("http://www.adobe.com;)); } + + @Test + public void testNoFilter() throws Exception { +// https://issues.apache.org/jira/browse/NUTCH-2189 +String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt"; +Configuration conf = NutchConfiguration.create(); +DomainURLFilter domainFilter = new DomainURLFilter(domainFile); +domainFilter.setConf(conf); +Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://www.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://www.google.com;)); +Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobar.net;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobas.net;)); +Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobar.be;)); +Assert.assertNotNull(domainFilter.filter("http://www.adobe.com;)); + } }
svn commit: r1717622 - in /nutch/trunk: CHANGES.txt conf/log4j.properties
Author: markus Date: Wed Dec 2 12:40:27 2015 New Revision: 1717622 URL: http://svn.apache.org/viewvc?rev=1717622=rev Log: NUTCH-2176 Clean up of log4j.properties Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1717622=1717621=1717622=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Dec 2 12:40:27 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 1.11 Release 25/10/2015 (dd/mm/) Release Report: http://s.apache.org/nutch11 +* NUTCH-2176 Clean up of log4j.properties (markus) + * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) * NUTCH-2177 Generator produces only one partition even in distributed mode (jnioche, snagel) Modified: nutch/trunk/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1717622=1717621=1717622=diff == --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Wed Dec 2 12:40:27 2015 @@ -24,41 +24,39 @@ log4j.rootLogger=INFO,DRFA log4j.threshold=ALL #special logging requirements for some commandline tools +log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.DeduplicationJob=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.Generator=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDbReader=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.Fetcher=INFO,cmdstdout -log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout 
log4j.logger.org.apache.nutch.fetcher.FetcherItem=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.FetcherItemQueue=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.FetcherItemQueues=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.QueueFeeder=INFO,cmdstdout -log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDbReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentChecker=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentMerger=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout log4j.logger.org.apache.nutch.indexwriter.solr.SolrIndexWriter=INFO,cmdstdout log4j.logger.org.apache.nutch.indexwriter.solr.SolrUtils-INFO,cmdstdout -log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout +log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout +log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN +log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout -log4j.logger.org.apache.nutch.util.hostdb.HostDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.util.hostdb.DumpHostDb=INFO,cmdstdout 
-log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentReader=INFO,cmdstdout log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout -log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout -log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout -log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN log4j.logger.org.apache.nutch=INFO log4j.logger.org.apache.hadoop=WARN
svn commit: r1703111 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Tue Sep 15 06:51:48 2015 New Revision: 1703111 URL: http://svn.apache.org/r1703111 Log: NUTCH-2093 Indexing filters to use current signatures Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1703111=1703110=1703111=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Sep 15 06:51:48 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2093 Indexing filters to use current signatures (markus) + * NUTCH-2092: Unit Test for NutchServer (Sujen Shah via mattmann) * NUTCH-2096 Explicitly indicate broswer binary to use when selecting Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1703111=1703110=1703111=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Sep 15 06:51:48 2015 @@ -287,6 +287,9 @@ public class IndexerMapReduce extends Co final Parse parse = new ParseImpl(parseText, parseData); try { + // Indexing filters may also be interested in the signature + fetchDatum.setSignature(dbDatum.getSignature()); + // extract information from dbDatum and pass it to // fetchDatum so that indexing filters can use it final Text url = (Text) dbDatum.getMetaData().get(
svn commit: r1688566 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentReader.java
Author: markus Date: Wed Jul 1 07:00:40 2015 New Revision: 1688566 URL: http://svn.apache.org/r1688566 Log: NUTCH-1692 SegmentReader was broken in distributed mode Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688566r1=1688565r2=1688566view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 07:00:40 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp) + * NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1688566r1=1688565r2=1688566view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jul 1 07:00:40 2015 @@ -507,55 +507,64 @@ public class SegmentReader extends Confi public void getStats(Path segment, final SegmentReaderStats stats) throws Exception { -SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( -getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); long cnt = 0L; Text key = new Text(); -for (int i = 0; i readers.length; i++) { - while (readers[i].next(key)) -cnt++; - readers[i].close(); -} -stats.generated = cnt; -Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); -if (fs.exists(fetchDir) fs.getFileStatus(fetchDir).isDir()) { - cnt = 0L; - long start = Long.MAX_VALUE; - long end = Long.MIN_VALUE; - CrawlDatum value = new CrawlDatum(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, - getConf()); - for (int 
i = 0; i mreaders.length; i++) { -while (mreaders[i].next(key, value)) { + +if (ge) { + SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( + getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); + for (int i = 0; i readers.length; i++) { +while (readers[i].next(key)) cnt++; - if (value.getFetchTime() start) -start = value.getFetchTime(); - if (value.getFetchTime() end) -end = value.getFetchTime(); +readers[i].close(); + } + stats.generated = cnt; +} + +if (fe) { + Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); + if (fs.exists(fetchDir) fs.getFileStatus(fetchDir).isDir()) { +cnt = 0L; +long start = Long.MAX_VALUE; +long end = Long.MIN_VALUE; +CrawlDatum value = new CrawlDatum(); +MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, +getConf()); +for (int i = 0; i mreaders.length; i++) { + while (mreaders[i].next(key, value)) { +cnt++; +if (value.getFetchTime() start) + start = value.getFetchTime(); +if (value.getFetchTime() end) + end = value.getFetchTime(); + } + mreaders[i].close(); } -mreaders[i].close(); +stats.start = start; +stats.end = end; +stats.fetched = cnt; } - stats.start = start; - stats.end = end; - stats.fetched = cnt; } -Path parseDir = new Path(segment, ParseData.DIR_NAME); -if (fs.exists(parseDir) fs.getFileStatus(parseDir).isDir()) { - cnt = 0L; - long errors = 0L; - ParseData value = new ParseData(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, - getConf()); - for (int i = 0; i mreaders.length; i++) { -while (mreaders[i].next(key, value)) { - cnt++; - if (!value.getStatus().isSuccess()) -errors++; + +if (pd) { + Path parseDir = new Path(segment, ParseData.DIR_NAME); + if (fs.exists(parseDir) fs.getFileStatus(parseDir).isDir()) { +cnt = 0L; +long errors = 0L; +ParseData value = new ParseData(); +MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, +getConf()); +for (int i = 0; i mreaders.length; i++) { + while (mreaders[i].next(key, 
value)) { +cnt++; +if (!value.getStatus().isSuccess()) + errors++; + } + mreaders[i].close(); } -mreaders[i
svn commit: r1688561 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: markus Date: Wed Jul 1 06:56:32 2015 New Revision: 1688561 URL: http://svn.apache.org/r1688561 Log: NUTCH-1684 ParseMeta to be added before fetch schedulers are run Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688561r1=1688560r2=1688561view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 06:56:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) + * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) (Asitang Mishra, snagel via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1688561r1=1688560r2=1688561view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Jul 1 06:56:32 2015 @@ -209,6 +209,13 @@ public class CrawlDbReducer implements case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected case CrawlDatum.STATUS_FETCH_REDIR_PERM: case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified + // https://issues.apache.org/jira/browse/NUTCH-1656 + if (metaFromParse != null) { +for (EntryWritable, Writable e : metaFromParse.entrySet()) { + result.getMetaData().put(e.getKey(), e.getValue()); +} + } + // determine the modification status int modified = FetchSchedule.STATUS_UNKNOWN; if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) { @@ -260,13 +267,6 @@ public class CrawlDbReducer implements result.setSignature(signature); } - // https://issues.apache.org/jira/browse/NUTCH-1656 - if (metaFromParse != null) { -for (EntryWritable, Writable e : 
metaFromParse.entrySet()) { - result.getMetaData().put(e.getKey(), e.getValue()); -} - } - // if fetchInterval is larger than the system-wide maximum, trigger // an unconditional recrawl. This prevents the page to be stuck at // NOTMODIFIED state, when the old fetched copy was already removed with
svn commit: r1675058 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Tue Apr 21 07:43:32 2015 New Revision: 1675058 URL: http://svn.apache.org/r1675058 Log: NUTCH-1697 SegmentMerger to implement Tool Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675058r1=1675057r2=1675058view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Apr 21 07:43:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1697 SegmentMerger to implement Tool (markus, snagel) + * NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann) * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1675058r1=1675057r2=1675058view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 21 07:43:32 2015 @@ -51,6 +51,8 @@ import org.apache.hadoop.mapred.Sequence import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileRecordReader; import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; import org.apache.nutch.metadata.MetaWrapper; @@ -118,7 +120,7 @@ import org.apache.nutch.util.NutchJob; * * @author Andrzej Bialecki */ -public class SegmentMerger extends Configured implements +public class SegmentMerger extends Configured implements Tool, MapperText, MetaWrapper, Text, MetaWrapper, ReducerText, MetaWrapper, Text, MetaWrapper { private static final Logger LOG = LoggerFactory @@ -691,7 +693,7 @@ public class SegmentMerger 
extends Confi /** * @param args */ - public static void main(String[] args) throws Exception { + public int run(String[] args) throws Exception { if (args.length 2) { System.err .println(SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice ]); @@ -706,7 +708,7 @@ public class SegmentMerger extends Confi .println(\t-normalize\t\tnormalize URL via current URLNormalizers); System.err .println(\t-slice \tcreate many output segments, each containing URLs); - return; + return -1; } Configuration conf = NutchConfiguration.create(); final FileSystem fs = FileSystem.get(conf); @@ -734,11 +736,18 @@ public class SegmentMerger extends Confi } if (segs.size() == 0) { System.err.println(ERROR: No input segments.); - return; + return -1; } -SegmentMerger merger = new SegmentMerger(conf); -merger.merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, + +merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, sliceSize); +return 0; + } + + public static void main(String[] args) throws Exception { +int result = ToolRunner.run(NutchConfiguration.create(), +new SegmentMerger(), args); +System.exit(result); } }
svn commit: r1666471 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/NutchWritable.java
Author: markus Date: Fri Mar 13 14:58:05 2015 New Revision: 1666471 URL: http://svn.apache.org/r1666471 Log: NUTCH-1955 ByteWritable missing in NutchWritable Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1666471r1=1666470r2=1666471view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 13 14:58:05 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1955 ByteWritable missing in NutchWritable (markus) + * NUTCH-1956 Members to be public in URLCrawlDatum (markus) * NUTCH-1954 FilenameTooLong error appears in CommonCrawlDumper (mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1666471r1=1666470r2=1666471view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Fri Mar 13 14:58:05 2015 @@ -29,6 +29,7 @@ public class NutchWritable extends Gener org.apache.hadoop.io.NullWritable.class, org.apache.hadoop.io.BooleanWritable.class, org.apache.hadoop.io.LongWritable.class, +org.apache.hadoop.io.ByteWritable.class, org.apache.hadoop.io.BytesWritable.class, org.apache.hadoop.io.FloatWritable.class, org.apache.hadoop.io.IntWritable.class,
svn commit: r1663698 - in /nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-
Author: markus Date: Tue Mar 3 13:16:39 2015 New Revision: 1663698 URL: http://svn.apache.org/r1663698 Log: NUTCH 1921 Optionally disable HTTP if-modified-since header Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Mar 3 13:16:39 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus) + * NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc) * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Mar 3 13:16:39 2015 @@ -297,6 +297,17 @@ /description /property +property + namehttp.enable.if.modified.since.header/name + valuetrue/value + descriptionWhether Nutch sends an HTTP If-Modified-Since header. It reduces + bandwidth when enabled by not downloading pages that respond with an HTTP + Not-Modified header. URL's that are not downloaded are not passed through + parse or indexing filters. If you regularly modify filters, you should force + Nutch to also download unmodified pages by disabling this feature. 
+ /description +/property + !-- FTP properties -- property Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Mar 3 13:16:39 2015 @@ -107,6 +107,9 @@ public abstract class HttpBase implement /** Which TLS/SSL cipher suites to support */ protected SetString tlsPreferredCipherSuites; + + /** Configuration directive for If-Modified-Since HTTP header */ + public boolean enableIfModifiedsinceHeader = true; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -137,6 +140,7 @@ public abstract class HttpBase implement // backward-compatible default setting this.useHttp11 = conf.getBoolean(http.useHttp11, false); this.responseTime = conf.getBoolean(http.store.responsetime, true); +this.enableIfModifiedsinceHeader = conf.getBoolean(http.enable.if.modified.since.header, true); this.robots.setConf(conf); String[] protocols = conf.getStrings(http.tls.supported.protocols, @@ -298,6 +302,10 @@ public abstract class HttpBase implement public int getTimeout() { return timeout; } + + public boolean isIfModifiedSinceEnabled() { +return enableIfModifiedsinceHeader; + } public int getMaxContent() { return maxContent; Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Mar 3 13:16:39 2015 @@ -192,7 +192,7 @@ public class HttpResponse implements Res reqStr.append(this.http.getAccept()); reqStr.append(\r\n); - if (datum.getModifiedTime() 0) { + if (http.isIfModifiedSinceEnabled() datum.getModifiedTime() 0) { reqStr.append(If-Modified-Since: + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append(\r\n); Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient
svn commit: r1659532 - in /nutch/branches/2.x: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Feb 13 12:25:13 2015 New Revision: 1659532 URL: http://svn.apache.org/r1659532 Log: NUTCH-1925 Upgrade to Apache Tika 1.7 Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/ivy/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Feb 13 12:25:13 2015 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.4-SNAPSHOT +* NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) + * NUTCH-1924 Nutch + HBase Docker (RadosÅaw Stankiewicz via lewismc) * NUTCH-1920 Upgrade Nutch to use Java 1.7 (lewismc) Modified: nutch/branches/2.x/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/ivy/ivy.xml (original) +++ nutch/branches/2.x/ivy/ivy.xml Fri Feb 13 12:25:13 2015 @@ -55,7 +55,7 @@ /dependency dependency org=com.ibm.icu name=icu4j rev=4.0.1 / -dependency org=org.apache.tika name=tika-core rev=1.6 / +dependency org=org.apache.tika name=tika-core rev=1.7 / dependency org=com.googlecode.juniversalchardet name=juniversalchardet rev=1.0.3/ dependency org=log4j name=log4j rev=1.2.15 conf=*-master / Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Fri Feb 13 12:25:13 2015 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.6 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.7 conf=*-default exclude org=org.apache.tika name=tika-core / 
/dependency override module=rome rev=0.9/ Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Fri Feb 13 12:25:13 2015 @@ -38,27 +38,27 @@ library name=commons-httpclient-3.1.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.8.6.jar/ + library name=fontbox-1.8.8.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0.2.jar/ library name=java-libpst-0.8.1.jar/ library name=jcip-annotations-1.0.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.8.6.jar/ + library name=jempbox-1.8.8.jar/ library name=jhighlight-1.0.jar/ library name=jmatio-1.0.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2.20.jar/ - library name=pdfbox-1.8.6.jar/ - library name=poi-3.11-beta2.jar/ - library name=poi-ooxml-3.11-beta2.jar/ - library name=poi-ooxml-schemas-3.11-beta2.jar/ - library name=poi-scratchpad-3.11-beta2.jar/ + library name=pdfbox-1.8.8.jar/ + library name=poi-3.11.jar/ + library name=poi-ooxml-3.11.jar/ + library name=poi-ooxml-schemas-3.11.jar/ + library name=poi-scratchpad-3.11.jar/ library name=rome-0.9.jar/ library name=slf4j-api-1.6.1.jar/ library name=tagsoup-1.2.1.jar/ - library name=tika-parsers-1.6.jar/ + library name=tika-parsers-1.7.jar/ library name=unidataCommon-4.2.20.jar/ library name=vorbis-java-core-0.6.jar/ library name=vorbis-java-tika-0.6.jar/
svn commit: r1659533 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDbReader.java
Author: markus Date: Fri Feb 13 12:28:13 2015 New Revision: 1659533 URL: http://svn.apache.org/r1659533 Log: NUTCH-1724 LinkDBReader to support regex output filtering Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659533r1=1659532r2=1659533view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Feb 13 12:28:13 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1724 LinkDBReader to support regex output filtering (markus) + * NUTCH-1939 Fetcher fails to follow redirects (Leo Ye via snagel) * NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1659533r1=1659532r2=1659533view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Fri Feb 13 12:28:13 2015 @@ -19,6 +19,9 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + // Commons Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,7 +46,7 @@ import java.io.Closeable; public class LinkDbReader extends Configured implements Tool, Closeable { public static final Logger LOG = LoggerFactory.getLogger(LinkDbReader.class); - private static final PartitionerWritableComparable?, Writable PARTITIONER = new HashPartitionerWritableComparable?, Writable(); + private static final PartitionerWritableComparable, Writable PARTITIONER = new HashPartitionerWritableComparable, Writable(); private FileSystem fs; private Path directory; @@ -90,8 +93,33 @@ public class LinkDbReader extends Config } } } + + public static class LinkDBDumpMapper 
implements MapperText, Inlinks, Text, Inlinks { +Pattern pattern = null; +Matcher matcher = null; + +public void configure(JobConf job) { + if (job.get(linkdb.regex, null) != null) { +pattern = Pattern.compile(job.get(linkdb.regex)); + } +} + +public void close() {} +public void map(Text key, Inlinks value, OutputCollectorText, Inlinks output, Reporter reporter) +throws IOException { + + if (pattern != null) { +matcher = pattern.matcher(key.toString()); +if (!matcher.matches()) { + return; +} + } + + output.collect(key, value); +} + } - public void processDumpJob(String linkdb, String output) throws IOException { + public void processDumpJob(String linkdb, String output, String regex) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat(-MM-dd HH:mm:ss); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { @@ -102,6 +130,11 @@ public class LinkDbReader extends Config JobConf job = new NutchJob(getConf()); job.setJobName(read + linkdb); + +if (regex != null) { + job.set(linkdb.regex, regex); + job.setMapperClass(LinkDBDumpMapper.class); +} FileInputFormat.addInputPath(job, new Path(linkdb, LinkDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -127,16 +160,24 @@ public class LinkDbReader extends Config public int run(String[] args) throws Exception { if (args.length 2) { System.err - .println(Usage: LinkDbReader linkdb (-dump out_dir | -url url)); + .println(Usage: LinkDbReader linkdb (-dump out_dir [-regex regex]) | -url url); System.err .println(\t-dump out_dir\tdump whole link db to a text file in out_dir); System.err + .println(\t\t-regex regex\trestrict to url's matching expression); + System.err .println(\t-url url\tprint information about url to System.out); return -1; } try { if (args[1].equals(-dump)) { -processDumpJob(args[0], args[2]); +String regex = null; +for (int i = 2; i args.length; i++) { + if (args[i].equals(-regex)) { +regex = args[++i]; + } +} +processDumpJob(args[0], args[2], regex); return 0; } 
else if (args[1].equals(-url)) { init(new Path(args[0]));
svn commit: r1659169 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDb.java
Author: markus Date: Thu Feb 12 08:42:49 2015 New Revision: 1659169 URL: http://svn.apache.org/r1659169 Log: NUTCH-1913 LinkDB to implement db.ignore.external.links Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659169r1=1659168r2=1659169view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:42:49 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel) + * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) * NUTCH-1323 AjaxNormalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1659169r1=1659168r2=1659169view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 12 08:42:49 2015 @@ -49,12 +49,14 @@ public class LinkDb extends Configured i public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); public static final String IGNORE_INTERNAL_LINKS = db.ignore.internal.links; + public static final String IGNORE_EXTERNAL_LINKS = db.ignore.external.links; public static final String CURRENT_NAME = current; public static final String LOCK_NAME = .locked; private int maxAnchorLength; private boolean ignoreInternalLinks; + private boolean ignoreExternalLinks; private URLFilters urlFilters; private URLNormalizers urlNormalizers; @@ -68,6 +70,8 @@ public class LinkDb extends Configured i public void configure(JobConf job) { maxAnchorLength = job.getInt(db.max.anchor.length, 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); +ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); + if (job.getBoolean(LinkDbFilter.URL_FILTERING, 
false)) { urlFilters = new URLFilters(job); } @@ -115,6 +119,11 @@ public class LinkDb extends Configured i if (toHost == null || toHost.equals(fromHost)) { // internal link continue; // skip it } + } else if (ignoreExternalLinks) { +String toHost = getHost(toUrl); +if (toHost == null || !toHost.equals(fromHost)) { // external link + continue; // skip it +} } if (urlNormalizers != null) { try { @@ -180,6 +189,15 @@ public class LinkDb extends Configured i if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) { LOG.info(LinkDb: internal links will be ignored.); } + if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { +LOG.info(LinkDb: external links will be ignored.); + } +} +if (job.getBoolean(IGNORE_INTERNAL_LINKS, true) + job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { + LOG.warn(LinkDb: internal and external links are ignored! + + Nothing to do, actually. Exiting.); + return; } for (int i = 0; i segments.length; i++) { @@ -291,7 +309,6 @@ public class LinkDb extends Configured i System.err.println(\t-noFilter\tdon't apply URLFilters to link URLs); return -1; } -Path segDir = null; final FileSystem fs = FileSystem.get(getConf()); Path db = new Path(args[0]); ArrayListPath segs = new ArrayListPath();
svn commit: r1659167 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-ajax/ src/plugin/urlnormalizer-ajax/src/ src/plugin/urlnormalizer-ajax/src/java/ src/plugin/urlnormalizer-ajax/src/java/
Author: markus Date: Thu Feb 12 08:30:31 2015 New Revision: 1659167 URL: http://svn.apache.org/r1659167 Log: NUTCH-1323 AjaxNormalizer Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml nutch/trunk/src/plugin/urlnormalizer-ajax/src/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659167r1=1659166r2=1659167view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:30:31 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1323 AjaxNormalizer 
(markus) + * NUTCH-1918 TikaParser specifies a default namespace when generating DOM (jnioche) * NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche) Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1659167r1=1659166r2=1659167view=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Thu Feb 12 08:30:31 2015 @@ -69,6 +69,7 @@ ant dir=urlfilter-suffix target=deploy/ ant dir=urlfilter-validator target=deploy/ ant dir=urlmeta target=deploy/ + ant dir=urlnormalizer-ajax target=deploy/ ant dir=urlnormalizer-basic target=deploy/ ant dir=urlnormalizer-host target=deploy/ ant dir=urlnormalizer-pass target=deploy/ @@ -107,6 +108,7 @@ ant dir=urlfilter-regex target=test/ ant dir=urlfilter-suffix target=test/ ant dir=urlfilter-validator target=test/ + ant dir=urlnormalizer-ajax target=test/ ant dir=urlnormalizer-basic target=test/ ant dir=urlnormalizer-host target=test/ ant dir=urlnormalizer-pass target=test/ @@ -164,8 +166,9 @@ ant dir=urlfilter-suffix target=clean/ ant dir=urlfilter-validator target=clean/ ant dir=urlmeta target=clean/ -ant dir=urlnormalizer-host target=clean/ +ant dir=urlnormalizer-ajax target=clean/ ant dir=urlnormalizer-basic target=clean/ +ant dir=urlnormalizer-host target=clean/ ant dir=urlnormalizer-pass target=clean/ ant dir=urlnormalizer-querystring target=clean/ ant dir=urlnormalizer-regex target=clean/ Added: nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml?rev=1659167view=auto == --- nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml Thu Feb 12 08:30:31 2015 @@ -0,0 +1,22 @@ +?xml version=1.0? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License
svn commit: r1607043 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Tue Jul 1 11:07:57 2014 New Revision: 1607043 URL: http://svn.apache.org/r1607043 Log: have at least a title on all pages Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1607043r1=1607042r2=1607043view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Tue Jul 1 11:07:57 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - title{% block title %}{% endblock %} -- Apache Nutch/title + titleApache Nutch/title /head body
svn commit: r914579 - /websites/production/nutch/content/
Author: markus Date: Tue Jul 1 11:09:09 2014 New Revision: 914579 Log: Add title to pages. Added: websites/production/nutch/content/ - copied from r914578, websites/staging/nutch/trunk/content/
svn commit: r1606693 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 11:44:03 2014 New Revision: 1606693 URL: http://svn.apache.org/r1606693 Log: page title missing Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606693r1=1606692r2=1606693view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 11:44:03 2014 @@ -1,3 +1,4 @@ +Welcome to Apache Nutch !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606694 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 11:46:31 2014 New Revision: 1606694 URL: http://svn.apache.org/r1606694 Log: Apparently the page header input box does not result in a title Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606694r1=1606693r2=1606694view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 11:46:31 2014 @@ -1,4 +1,3 @@ -Welcome to Apache Nutch !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606695 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Mon Jun 30 11:50:44 2014 New Revision: 1606695 URL: http://svn.apache.org/r1606695 Log: added title Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1606695r1=1606694r2=1606695view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Mon Jun 30 11:50:44 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - + title{% block title %}{% endblock %}/title /head body
svn commit: r1606696 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Mon Jun 30 11:52:25 2014 New Revision: 1606696 URL: http://svn.apache.org/r1606696 Log: actually put something in the title Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1606696r1=1606695r2=1606696view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Mon Jun 30 11:52:25 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - title{% block title %}{% endblock %}/title + title{% block title %}{% endblock %} -- Apache Nutch/title /head body
svn commit: r1606703 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:01:53 2014 New Revision: 1606703 URL: http://svn.apache.org/r1606703 Log: CMS commit to nutch by markus Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606703r1=1606702r2=1606703view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:01:53 2014 @@ -1,3 +1,4 @@ +test title? will it work? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606704 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:03:56 2014 New Revision: 1606704 URL: http://svn.apache.org/r1606704 Log: will this work? Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606704r1=1606703r2=1606704view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:03:56 2014 @@ -1,4 +1,3 @@ -test title? will it work? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file @@ -17,6 +16,7 @@ KIND, either express or implied. See th specific language governing permissions and limitations under the License. -- +{% block title %}does this work ten?{% endblock %} !-- Carousel== -- div id=myCarousel class=carousel slide div class=carousel-inner
svn commit: r1606705 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:04:50 2014 New Revision: 1606705 URL: http://svn.apache.org/r1606705 Log: restore stuff i broke Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606705r1=1606704r2=1606705view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:04:50 2014 @@ -16,7 +16,6 @@ KIND, either express or implied. See th specific language governing permissions and limitations under the License. -- -{% block title %}does this work ten?{% endblock %} !-- Carousel== -- div id=myCarousel class=carousel slide div class=carousel-inner
svn commit: r1600566 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/NodeWalker.java
Author: markus Date: Thu Jun 5 08:34:01 2014 New Revision: 1600566 URL: http://svn.apache.org/r1600566 Log: NUTCH-1782 NodeWalker to return current node Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1600566r1=1600565r2=1600566view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jun 5 08:34:01 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1782 NodeWalker to return current node (markus) + * NUTCH-1758 IndexChecker to send document to IndexWriters (jnioche) * NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus) Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1600566r1=1600565r2=1600566view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Thu Jun 5 08:34:01 2014 @@ -102,6 +102,14 @@ public class NodeWalker { } /** + * Return the current node. + * @return Node + */ + public Node getCurrentNode() { +return currentNode; + } + + /** * @return returns true if there are more nodes on the current stack. * */
svn commit: r1562058 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/hostdb/HostDb.java
Author: markus Date: Tue Jan 28 13:07:09 2014 New Revision: 1562058 URL: http://svn.apache.org/r1562058 Log: NUTCH-1717 HostDB not to complain if filters/normalizers are disabled Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1562058r1=1562057r2=1562058view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jan 28 13:07:09 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1717 HostDB not to complain if filters/normalizers are disabled (markus) + * NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp) * NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, DoÄacan Güney via markus) Modified: nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java?rev=1562058r1=1562057r2=1562058view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java Tue Jan 28 13:07:09 2014 @@ -505,12 +505,12 @@ public class HostDb extends Configured i conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize); // Check whether the urlfilter-domainblacklist plugin is loaded -if (urlfilter-domainblacklist.matches(conf.get(plugin.includes))) { +if (filter urlfilter-domainblacklist.matches(conf.get(plugin.includes))) { throw new Exception(domainblacklist-urlfilter must not be enabled); } // Check whether the urlnormalizer-host plugin is loaded -if (urlnormalizer-host.matches(conf.get(plugin.includes))) { +if (normalize urlnormalizer-host.matches(conf.get(plugin.includes))) { throw new Exception(urlnormalizer-host must not be enabled); }
svn commit: r1560985 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/plugin/Extension.java src/java/org/apache/nutch/plugin/PluginClassLoader.java src/java/org/apache/nutch/plugin/PluginRepos
Author: markus Date: Fri Jan 24 13:12:00 2014 New Revision: 1560985 URL: http://svn.apache.org/r1560985 Log: NUTCH-356 Plugin repository cache can lead to memory leak Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 24 13:12:00 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, DoÄacan Güney via markus) + * NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel) * NUTCH-1325 HostDB for Nutch (markus, tejasp) Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Fri Jan 24 13:12:00 2014 @@ -33,7 +33,6 @@ public class Extension { private String fClazz; private HashMapString, String fAttributes; private Configuration conf; - private PluginRepository pluginRepository; /** * @param pDescriptor @@ -52,7 +51,6 @@ public class Extension { setId(pId); setClazz(pExtensionClass); this.conf = conf; -this.pluginRepository = pluginRepository; } /** @@ -149,12 +147,13 @@ public class Extension { // The same is in PluginRepository.getPluginInstance(). // Suggested by Stefan Groschupf s...@media-style.com synchronized (getId()) { - try { -PluginClassLoader loader = fDescriptor.getClassLoader(); -Class? 
extensionClazz = loader.loadClass(getClazz()); + try { +PluginRepository pluginRepository = PluginRepository.get(conf); +Class extensionClazz = + pluginRepository.getCachedClass(fDescriptor, getClazz()); // lazy loading of Plugin in case there is no instance of the plugin // already. -this.pluginRepository.getPluginInstance(getDescriptor()); +pluginRepository.getPluginInstance(getDescriptor()); Object object = extensionClazz.newInstance(); if (object instanceof Configurable) { ((Configurable) object).setConf(this.conf); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java Fri Jan 24 13:12:00 2014 @@ -18,6 +18,7 @@ package org.apache.nutch.plugin; import java.net.URL; import java.net.URLClassLoader; +import java.util.Arrays; /** * The codePluginClassLoader/code contains only classes of the runtime @@ -30,6 +31,10 @@ import java.net.URLClassLoader; * @author joa23 */ public class PluginClassLoader extends URLClassLoader { + + private URL[] urls; + private ClassLoader parent; + /** * Construtor * @@ -40,5 +45,36 @@ public class PluginClassLoader extends U */ public PluginClassLoader(URL[] urls, ClassLoader parent) { super(urls, parent); + +this.urls = urls; +this.parent = parent; + } + + @Override + public int hashCode() { +final int PRIME = 31; +int result = 1; +result = PRIME * result + ((parent == null) ? 
0 : parent.hashCode()); +result = PRIME * result + Arrays.hashCode(urls); +return result; + } + + @Override + public boolean equals(Object obj) { +if (this == obj) + return true; +if (obj == null) + return false; +if (getClass() != obj.getClass()) + return false; +final PluginClassLoader other = (PluginClassLoader) obj; +if (parent == null) { + if (other.parent != null) +return false; +} else if (!parent.equals(other.parent)) + return false; +if (!Arrays.equals(urls, other.urls)) + return false; +return true; } } Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org
svn commit: r1559657 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Mon Jan 20 09:29:42 2014 New Revision: 1559657 URL: http://svn.apache.org/r1559657 Log: NUTCH-1680 CrawlDbReader to dump minRetry value Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1559657r1=1559656r2=1559657view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 20 09:29:42 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1680 CrawlDbReader to dump minRetry value (markus) + * NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc) * NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1559657r1=1559656r2=1559657view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Jan 20 09:29:42 2014 @@ -407,7 +407,7 @@ public class CrawlDbReader implements Cl } } - public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status) throws IOException { + public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException { if (LOG.isInfoEnabled()) { LOG.info(CrawlDb dump: starting); LOG.info(CrawlDb db: + crawlDb); @@ -433,7 +433,8 @@ public class CrawlDbReader implements Cl if (status != null) job.set(status, status); if (regex != null) job.set(regex, regex); - +if (retry != null) job.setInt(retry, retry); + job.setMapperClass(CrawlDbDumpMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); @@ -446,17 +447,26 @@ public class CrawlDbReader implements Cl 
Pattern pattern = null; Matcher matcher = null; String status = null; +Integer retry = null; public void configure(JobConf job) { if (job.get(regex, null) != null) { pattern = Pattern.compile(job.get(regex)); } status = job.get(status, null); + retry = job.getInt(retry, -1); } public void close() {} public void map(Text key, CrawlDatum value, OutputCollectorText, CrawlDatum output, Reporter reporter) throws IOException { + + // check retry + if (retry != -1) { +if (value.getRetriesSinceFetch() retry) { + return; +} + } // check status if (status != null @@ -542,6 +552,7 @@ public class CrawlDbReader implements Cl System.err.println(\t\t[-format normal]\tdump in standard format (default option)); System.err.println(\t\t[-format crawldb]\tdump as CrawlDB); System.err.println(\t\t[-regex expr]\tfilter records with expression); + System.err.println(\t\t[-retry num]\tminimum retry count); System.err.println(\t\t[-status status]\tfilter records by CrawlDatum status); System.err.println(\t-url url\tprint information on url to System.out); System.err.println(\t-topN out_dir [min]\tdump top urls sorted by score to out_dir); @@ -564,6 +575,7 @@ public class CrawlDbReader implements Cl param = args[++i]; String format = normal; String regex = null; +Integer retry = null; String status = null; for (int j = i + 1; j args.length; j++) { if (args[j].equals(-format)) { @@ -574,12 +586,16 @@ public class CrawlDbReader implements Cl regex = args[++j]; i=i+2; } + if (args[j].equals(-retry)) { +retry = Integer.parseInt(args[++j]); +i=i+2; + } if (args[j].equals(-status)) { status = args[++j]; i=i+2; } } -dbr.processDumpJob(crawlDb, param, conf, format, regex, status); +dbr.processDumpJob(crawlDb, param, conf, format, regex, status, retry); } else if (args[i].equals(-url)) { param = args[++i]; dbr.readUrl(crawlDb, param, conf);
svn commit: r1556474 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/NutchDocument.java
Author: markus Date: Wed Jan 8 09:39:47 2014 New Revision: 1556474 URL: http://svn.apache.org/r1556474 Log: NUTCH-1695 Add NutchDocument.toString() to ease debugging Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1556474r1=1556473r2=1556474view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 8 09:39:47 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus) + * NUTCH-1675 NutchField to support long (markus) * NUTCH-1670 set same crawldb directory in mergedb parameter (lufeng via tejasp) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java?rev=1556474r1=1556473r2=1556474view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java Wed Jan 8 09:39:47 2014 @@ -127,4 +127,18 @@ implements Writable, IterableEntryStri out.writeFloat(weight); documentMeta.write(out); } + + public String toString() { +StringBuilder sb = new StringBuilder(); +sb.append(doc {\n); +for (Map.EntryString, NutchField entry : fields.entrySet()) { + sb.append(\t); + sb.append(entry.getKey()); + sb.append(:\t); + sb.append(entry.getValue()); + sb.append(\n); +} +sb.append(}\n); +return sb.toString(); + } }
svn commit: r1554791 - /nutch/trunk/conf/nutch-default.xml
Author: markus Date: Thu Jan 2 11:53:36 2014 New Revision: 1554791 URL: http://svn.apache.org/r1554791 Log: NUTCH-1360 fix entity in configuration Modified: nutch/trunk/conf/nutch-default.xml Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1554791r1=1554790r2=1554791view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Thu Jan 2 11:53:36 2014 @@ -29,7 +29,7 @@ valuefalse/value descriptionEnables us to capture the specific IP address (InetSocketAddress) of the host which we connect to via - the given protocol. Currently supported is protocol-ftp + the given protocol. Currently supported is protocol-ftp and http. /description /property
svn commit: r1553115 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/URLUtil.java src/test/org/apache/nutch/util/TestURLUtil.java
Author: markus Date: Mon Dec 23 14:17:40 2013 New Revision: 1553115 URL: http://svn.apache.org/r1553115 Log: NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Dec 23 14:17:40 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel via markus) + * NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche) * NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 14:17:40 2013 @@ -481,7 +481,7 @@ public class URLUtil { try { URL u = new URL(url); URI p = new URI(u.getProtocol(), -null, +u.getUserInfo(), IDN.toASCII(u.getHost()), u.getPort(), u.getPath(), @@ -498,15 +498,25 @@ public class URLUtil { public static String toUNICODE(String url) { try { URL u = new URL(url); - URI p = new URI(u.getProtocol(), -null, -IDN.toUnicode(u.getHost()), -u.getPort(), -u.getPath(), -u.getQuery(), -u.getRef()); + StringBuilder sb = new StringBuilder(); + sb.append(u.getProtocol()); + sb.append(://); + if (u.getUserInfo() != null) { +sb.append(u.getUserInfo()); +sb.append('@'); + } + sb.append(IDN.toUnicode(u.getHost())); + if (u.getPort() != -1) { +sb.append(':'); +sb.append(u.getPort()); + } + sb.append(u.getFile()); // includes query + if (u.getRef() 
!= null) { +sb.append('#'); +sb.append(u.getRef()); + } - return p.toString(); + return sb.toString(); } catch (Exception e) { return null; Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original) +++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 14:17:40 2013 @@ -258,5 +258,22 @@ public class TestURLUtil assertEquals(targets[i][1], targets[i][1], u.toString()); } } + + public void testToUNICODE() throws Exception { +assertEquals(http://www.çevir.com;, URLUtil.toUNICODE(http://www.xn--evir-zoa.com;)); +assertEquals(http://uni-tübingen.de/;, URLUtil.toUNICODE(http://xn--uni-tbingen-xhb.de/;)); +assertEquals( +http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1;, + URLUtil.toUNICODE(http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1;)); + + } + + public void testToASCII() throws Exception { +assertEquals(http://www.xn--evir-zoa.com;, URLUtil.toASCII(http://www.çevir.com;)); +assertEquals(http://xn--uni-tbingen-xhb.de/;, URLUtil.toASCII(http://uni-tübingen.de/;)); +assertEquals( +http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1;, + URLUtil.toASCII(http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1;)); + } }
svn commit: r1528072 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Tue Oct 1 12:50:06 2013 New Revision: 1528072 URL: http://svn.apache.org/r1528072 Log: NUTCH-1646 IndexerMapReduce to consider DB status Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1528072r1=1528071r2=1528072view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Oct 1 12:50:06 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1646 IndexerMapReduce to consider DB status (markus) + * NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel) * NUTCH-1637 URLUtil is missing getProtocol (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1528072r1=1528071r2=1528072view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Oct 1 12:50:06 2013 @@ -189,14 +189,18 @@ implements MapperText, Writable, Text, * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT. 
*/ if (delete) { - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { reporter.incrCounter(IndexerStatus, Documents deleted, 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); output.collect(key, action); return; } - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || + fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { +reporter.incrCounter(IndexerStatus, Deleted redirects, 1); reporter.incrCounter(IndexerStatus, Perm redirects deleted, 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
svn commit: r1499948 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Fri Jul 5 08:52:51 2013 New Revision: 1499948 URL: http://svn.apache.org/r1499948 Log: NUTCH-1520 SegmentMerger looses records Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499948r1=1499947r2=1499948view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jul 5 08:52:51 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1520 SegmentMerger looses records (markus) + * NUTCH-1602 improve the readability of metadata in readdb dump normal (lufeng) * NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1499948r1=1499947r2=1499948view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Fri Jul 5 08:52:51 2013 @@ -412,10 +412,14 @@ public class SegmentMerger extends Confi lastF = val; lastFname = sp.segmentName; } else { -// take newer -if (lastFname.compareTo(sp.segmentName) 0) { - lastF = val; - lastFname = sp.segmentName; +// only consider fetch status +// https://issues.apache.org/jira/browse/NUTCH-1520 +if (CrawlDatum.hasFetchStatus(val)) { + // take newer + if (lastFname.compareTo(sp.segmentName) 0) { +lastF = val; +lastFname = sp.segmentName; + } } } } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) { @@ -480,7 +484,7 @@ public class SegmentMerger extends Confi linked.isEmpty() ? null : linked.lastEntry().getValue())){ return; } - + curCount++; String sliceName = null; MetaWrapper wrapper = new MetaWrapper();
svn commit: r1499959 - in /nutch/branches/2.x: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/howto_upgrade_tika.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Jul 5 10:27:47 2013 New Revision: 1499959 URL: http://svn.apache.org/r1499959 Log: NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) Added: nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/ivy/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Jul 5 10:27:47 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) + * NUTCH-1594 count variable is never changed in ParseUtil class (Canan via Feng) Release 2.2.1 - 06/27/2013 (mm/dd/) Modified: nutch/branches/2.x/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/ivy/ivy.xml (original) +++ nutch/branches/2.x/ivy/ivy.xml Fri Jul 5 10:27:47 2013 @@ -58,7 +58,7 @@ /dependency dependency org=com.ibm.icu name=icu4j rev=4.0.1 / -dependency org=org.apache.tika name=tika-core rev=1.3 / +dependency org=org.apache.tika name=tika-core rev=1.4 / dependency org=com.googlecode.juniversalchardet name=juniversalchardet rev=1.0.3/ dependency org=log4j name=log4j rev=1.2.15 conf=*-master / Added: nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt?rev=1499959view=auto == --- nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt (added) +++ nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt Fri Jul 5 10:27:47 2013 @@ -0,0 +1,8 @@ +1. Upgrade Tika depencency in trunk/ivy/ivy.xml + +2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml + +3. 
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + To get the list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib/ Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Fri Jul 5 10:27:47 2013 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.3 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.4 conf=*-default exclude org=org.apache.tika name=tika-core / /dependency /dependencies Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Fri Jul 5 10:27:47 2013 @@ -28,39 +28,39 @@ library name=apache-mime4j-core-0.7.2.jar/ library name=apache-mime4j-dom-0.7.2.jar/ - library name=asm-3.1.jar/ + library name=asm-5.1.jar/ library name=aspectjrt-1.6.11.jar/ library name=bcmail-jdk15-1.45.jar/ library name=bcprov-jdk15-1.45.jar/ library name=boilerpipe-1.1.0.jar/ library name=commons-codec-1.5.jar/ - library name=commons-compress-1.4.1.jar/ + library name=commons-compress-1.5.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.7.1.jar/ + library name=fontbox-1.8.1.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0-RC-1.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.7.1.jar/ + library name=jempbox-1.8.1.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2-min.jar/ - library name=pdfbox-1.7.1.jar/ - 
library name=poi-3.8.jar/ - library name=poi-ooxml-3.8.jar/ - library name=poi-ooxml-schemas-3.8.jar/ - library name=poi-scratchpad-3.8.jar/ + library name=pdfbox-1.8.1.jar/ + library name=poi-3.9.jar/ + library name=poi-ooxml-3.9.jar/ + library name=poi-ooxml-schemas-3.9.jar/ + library name=poi-scratchpad-3.9.jar/ library name
svn commit: r1499960 - in /nutch/trunk: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/howto_upgrade_tika.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Jul 5 10:28:46 2013 New Revision: 1499960 URL: http://svn.apache.org/r1499960 Log: NUTCH-1595 Upgrade to Tika 1.4 Added: nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt Modified: nutch/trunk/CHANGES.txt nutch/trunk/ivy/ivy.xml nutch/trunk/src/plugin/parse-tika/ivy.xml nutch/trunk/src/plugin/parse-tika/plugin.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jul 5 10:28:46 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) + * NUTCH-1598 ElasticSearchIndexer to read ImmutableSettings from config (markus) * NUTCH-1520 SegmentMerger looses records (markus) Modified: nutch/trunk/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Fri Jul 5 10:28:46 2013 @@ -64,7 +64,7 @@ exclude org=ant name=ant / /dependency - dependency org=org.apache.tika name=tika-core rev=1.3 / + dependency org=org.apache.tika name=tika-core rev=1.4 / dependency org=com.ibm.icu name=icu4j rev=4.0.1 / dependency org=org.mortbay.jetty name=jetty-client Added: nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt?rev=1499960view=auto == --- nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt (added) +++ nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt Fri Jul 5 10:28:46 2013 @@ -0,0 +1,8 @@ +1. Upgrade Tika depencency in trunk/ivy/ivy.xml + +2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml + +3. 
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + To get the list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib/ Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ nutch/trunk/src/plugin/parse-tika/ivy.xml Fri Jul 5 10:28:46 2013 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.3 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.4 conf=*-default exclude org=org.apache.tika name=tika-core / /dependency /dependencies Modified: nutch/trunk/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/src/plugin/parse-tika/plugin.xml (original) +++ nutch/trunk/src/plugin/parse-tika/plugin.xml Fri Jul 5 10:28:46 2013 @@ -28,39 +28,39 @@ library name=apache-mime4j-core-0.7.2.jar/ library name=apache-mime4j-dom-0.7.2.jar/ - library name=asm-3.1.jar/ + library name=asm-4.1.jar/ library name=aspectjrt-1.6.11.jar/ library name=bcmail-jdk15-1.45.jar/ library name=bcprov-jdk15-1.45.jar/ library name=boilerpipe-1.1.0.jar/ library name=commons-codec-1.5.jar/ - library name=commons-compress-1.4.1.jar/ + library name=commons-compress-1.5.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.7.1.jar/ + library name=fontbox-1.8.1.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0-RC-1.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.7.1.jar/ + library name=jempbox-1.8.1.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2-min.jar/ - library name=pdfbox-1.7.1.jar/ - library name=poi-3.8.jar/ - library name=poi-ooxml-3.8.jar/ - 
library name=poi-ooxml-schemas-3.8.jar/ - library name=poi-scratchpad-3.8.jar/ + library name=pdfbox-1.8.1.jar/ + library name=poi-3.9.jar/ + library name=poi-ooxml-3.9.jar/ + library name=poi-ooxml-schemas-3.9.jar/ + library name=poi-scratchpad-3.9.jar/ library name=rome-0.9.jar/ library name=slf4j-api-1.5.6.jar/ library name=tagsoup-1.2.1.jar/ - library name=tika-parsers-1.3.jar/ + library name=tika
svn commit: r1499684 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: markus Date: Thu Jul 4 08:50:25 2013 New Revision: 1499684 URL: http://svn.apache.org/r1499684 Log: NUTCH-1600 Injector overwrite does not always work properly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499684r1=1499683r2=1499684view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 08:50:25 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1600 Injector overwrite does not always work properly (markus) + * NUTCH-1581 CrawlDB csv output to include metadata (markus) * NUTCH-1327 QueryStringNormalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1499684r1=1499683r2=1499684view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Jul 4 08:50:25 2013 @@ -186,6 +186,8 @@ public class Injector extends Configured scoreInjected = job.getFloat(db.score.injected, 1.0f); overwrite = job.getBoolean(db.injector.overwrite, false); update = job.getBoolean(db.injector.update, false); + LOG.info(Injector: overwrite: + overwrite); + LOG.info(Injector: update: + update); } public void close() {} @@ -209,22 +211,20 @@ public class Injector extends Configured oldSet = true; } } + CrawlDatum res = null; + + // Old default behaviour + if (injectedSet !oldSet) { +res = injected; + } else { +res = old; + } /** * Whether to overwrite, ignore or update existing records * @see https://issues.apache.org/jira/browse/NUTCH-1405 */ - - // Injected record already exists and overwrite but not update - if (injectedSet oldSet overwrite) { -res = injected; - -if (update) { - LOG.info(key.toString() + overwritten with injected record but update was specified.); -} - } - // 
Injected record already exists and update but not overwrite if (injectedSet oldSet update !overwrite) { res = old; @@ -233,11 +233,9 @@ public class Injector extends Configured old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval()); } - // Old default behaviour - if (injectedSet !oldSet) { + // Injected record already exists and overwrite + if (injectedSet oldSet overwrite) { res = injected; - } else { -res = old; } output.collect(key, res);
svn commit: r1499696 - in /nutch/trunk: CHANGES.txt src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Author: markus Date: Thu Jul 4 09:07:12 2013 New Revision: 1499696 URL: http://svn.apache.org/r1499696 Log: NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499696r1=1499695r2=1499696view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 09:07:12 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) + * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus) * NUTCH-1600 Injector overwrite does not always work properly (markus) Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499696r1=1499695r2=1499696view=diff == --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 09:07:12 2013 @@ -19,6 +19,7 @@ package org.apache.nutch.parse.headings; import java.util.ArrayList; import java.util.List; +import java.util.regex.*; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; @@ -33,6 +34,11 @@ import org.w3c.dom.*; */ public class HeadingsParseFilter implements HtmlParseFilter { + /** + * Pattern used to strip surpluss whitespace + */ + protected static Pattern whitespacePattern = Pattern.compile(\\s+); + private Configuration conf; private DocumentFragment doc; private String[] headings; @@ -113,6 +119,8 @@ public class HeadingsParseFilter impleme } } -return 
buffer.toString(); +// Return with stripped surplus whitespace +Matcher matcher = whitespacePattern.matcher(buffer.toString().trim()); +return matcher.replaceAll( ).trim(); } }
svn commit: r1499722 - in /nutch/trunk: CHANGES.txt src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Author: markus Date: Thu Jul 4 11:13:34 2013 New Revision: 1499722 URL: http://svn.apache.org/r1499722 Log: NUTCH-1596 HeadingsParseFilter not thread safe Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499722r1=1499721r2=1499722view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 11:13:34 2013 @@ -2,7 +2,9 @@ Nutch Change Log Nutch Development Trunk -* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) +* NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus) + +* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus) Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499722r1=1499721r2=1499722view=diff == --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 11:13:34 2013 @@ -40,17 +40,14 @@ public class HeadingsParseFilter impleme protected static Pattern whitespacePattern = Pattern.compile(\\s+); private Configuration conf; - private DocumentFragment doc; private String[] headings; private boolean multiValued = false; public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { -this.doc = doc; - Parse parse = parseResult.get(content.getUrl()); for (int i = 0 ; headings != null i headings.length ; i++ ) { - ListString discoveredHeadings = getElement(headings[i]); + ListString discoveredHeadings = 
getElement(doc, headings[i]); if (discoveredHeadings.size() 0) { for (String heading : discoveredHeadings) { @@ -82,7 +79,7 @@ public class HeadingsParseFilter impleme /** * Finds the specified element and returns its value */ - protected ListString getElement(String element) { + protected ListString getElement(DocumentFragment doc, String element) { ListString headings = new ArrayListString(); NodeWalker walker = new NodeWalker(doc);
svn commit: r1498830 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Tue Jul 2 08:36:13 2013 New Revision: 1498830 URL: http://svn.apache.org/r1498830 Log: NUTCH-1327 QueryStringNormalizer Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498830r1=1498829r2=1498830view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 2 08:36:13 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1327 QueryStringNormalizer (markus) + * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) * NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1498830r1=1498829r2=1498830view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Jul 2 08:36:13 2013 @@ -24,6 +24,7 @@ import java.net.URL; import java.util.Date; import java.util.Iterator; import java.util.Map; +import java.util.Map.Entry; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,6 +42,7 @@ import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; @@ -64,42 +66,41 @@ import org.apache.nutch.util.StringUtil; /** * Read utility for the CrawlDB. 
- * + * * @author Andrzej Bialecki - * + * */ public class CrawlDbReader implements Closeable { public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class); private MapFile.Reader[] readers = null; - + private void openReaders(String crawlDb, Configuration config) throws IOException { if (readers != null) return; FileSystem fs = FileSystem.get(config); readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config); } - + private void closeReaders() { if (readers == null) return; for (int i = 0; i readers.length; i++) { try { readers[i].close(); } catch (Exception e) { - + } } } - + public static class CrawlDatumCsvOutputFormat extends FileOutputFormatText,CrawlDatum { protected static class LineRecordWriter implements RecordWriterText,CrawlDatum { private DataOutputStream out; - public LineRecordWriter(DataOutputStream out) { this.out = out; try { - out.writeBytes(Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature\n); + out.writeBytes(Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature;Metadata\n); } catch (IOException e) {} } @@ -129,6 +130,18 @@ public class CrawlDbReader implements Cl out.writeByte(''); out.writeBytes(value.getSignature() != null ? 
StringUtil.toHexString(value.getSignature()): null); out.writeByte(''); + out.writeByte(';'); + out.writeByte(''); + if (value.getMetaData() != null) { +for (EntryWritable, Writable e : value.getMetaData().entrySet()) { + out.writeBytes(e.getKey().toString()); + out.writeByte(':'); + out.writeBytes(e.getValue().toString()); + out.writeBytes(|||); +} + } + out.writeByte(''); + out.writeByte('\n'); } @@ -165,10 +178,10 @@ public class CrawlDbReader implements Cl } } } - + public static class CrawlDbStatCombiner implements ReducerText, LongWritable, Text, LongWritable { LongWritable val = new LongWritable(); - + public CrawlDbStatCombiner() { } public void configure(JobConf job) { } public void close() {} @@ -249,7 +262,7 @@ public class CrawlDbReader implements Cl public static class CrawlDbTopNMapper implements MapperText, CrawlDatum, FloatWritable, Text { private static final FloatWritable fw = new FloatWritable(); private float min = 0.0f; - + public void configure(JobConf job) { long lmin = job.getLong(db.reader.topn.min, 0); if (lmin != 0) { @@ -264,11 +277,11 @@ public class CrawlDbReader implements Cl output.collect(fw, key
svn commit: r1498832 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-querystring/ src/plugin/urlnormalizer-querystring/src/ src/plugin/urlnormalizer-querystring/src/java/ src/plugin/urlnorm
Author: markus Date: Tue Jul 2 08:37:40 2013 New Revision: 1498832 URL: http://svn.apache.org/r1498832 Log: NUTCH-1581 CrawlDB csv output to include metadata Added: nutch/trunk/src/plugin/urlnormalizer-querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml nutch/trunk/src/plugin/urlnormalizer-querystring/src/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498832r1=1498831r2=1498832view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 2 08:37:40 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1581 CrawlDB csv output to include metadata (markus) + * NUTCH-1327 QueryStringNormalizer (markus) * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1498832r1=1498831r2=1498832view=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Tue Jul 2 08:37:40 2013 @@ -70,6 +70,7 @@ ant dir=urlnormalizer-basic target=deploy/ ant dir=urlnormalizer-host target=deploy/ ant dir=urlnormalizer-pass target=deploy/ + ant dir=urlnormalizer-querystring target=deploy/ ant dir=urlnormalizer-regex target=deploy/ /target @@ -105,6 +106,7 @@ ant dir=urlnormalizer-basic target=test/ ant dir=urlnormalizer-host target=test/ ant dir=urlnormalizer-pass target=test/ + ant dir=urlnormalizer-querystring target=test/ ant dir=urlnormalizer-regex target=test/ /parallel /target @@ -159,6 +161,7 @@ ant dir=urlnormalizer-host target=clean/ ant dir=urlnormalizer-basic target=clean/ ant dir=urlnormalizer-pass target=clean/ +ant dir=urlnormalizer-querystring target=clean/ ant dir=urlnormalizer-regex target=clean/ /target /project Added: nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml?rev=1498832view=auto == --- nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml Tue Jul 2 08:37:40 2013 @@ -0,0 +1,22 @@ +?xml version=1.0? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +-- +project name=urlnormalizer
svn commit: r1498346 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Mon Jul 1 10:03:12 2013 New Revision: 1498346 URL: http://svn.apache.org/r1498346 Log: NUTCH-1593 Normalize option missing in SegmentMerger's usage Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498346r1=1498345r2=1498346view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jul 1 10:03:12 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) + * NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) * NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1498346r1=1498345r2=1498346view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Mon Jul 1 10:03:12 2013 @@ -649,6 +649,7 @@ public class SegmentMerger extends Confi System.err.println(\t-dir segments\tparent dir containing several segments); System.err.println(\tseg1 seg2 ...\tlist of segment dirs); System.err.println(\t-filter\t\tfilter out URL-s prohibited by current URLFilters); + System.err.println(\t-normalize\t\tnormalize URL via current URLNormalizers); System.err.println(\t-slice \tcreate many output segments, each containing URLs); return; }
svn commit: r1496023 - in /nutch/branches/2.x: ./ src/plugin/ src/plugin/urlfilter-prefix/src/test/ src/plugin/urlfilter-prefix/src/test/org/ src/plugin/urlfilter-prefix/src/test/org/apache/ src/plugi
Author: markus Date: Mon Jun 24 13:12:59 2013 New Revision: 1496023 URL: http://svn.apache.org/r1496023 Log: NUTCH-1126 JUnit test for urlfilter-prefix Added: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/build.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1496023r1=1496022r2=1496023view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Jun 24 13:12:59 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) + * NUTCH-1585 Ensure duplicate tags do not exist in microformat-reltag tag set (lewismc) * NUTCH-1475 Index-More Plugin -- A better fall back value for date field (James Sullivan, snagel via lewismc) Modified: nutch/branches/2.x/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1496023r1=1496022r2=1496023view=diff == --- nutch/branches/2.x/src/plugin/build.xml (original) +++ nutch/branches/2.x/src/plugin/build.xml Mon Jun 24 13:12:59 2013 @@ -81,7 +81,8 @@ ant dir=language-identifier target=test/ ant dir=protocol-httpclient target=test/ ant dir=urlfilter-automaton target=test/ - ant dir=urlfilter-domain target=test / + ant dir=urlfilter-domain target=test/ + ant dir=urlfilter-prefix target=test/ ant dir=urlfilter-regex target=test/ ant dir=urlfilter-suffix 
target=test/ ant dir=urlnormalizer-basic target=test/ Added: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java?rev=1496023view=auto == --- nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java (added) +++ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java Mon Jun 24 13:12:59 2013 @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.prefix; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import java.io.IOException; + + +/** + * JUnit test for codePrefixURLFilter/code. 
+ * + * @author Talat Uyarer + * @author Cihad Guzel + */ +public class TestPrefixURLFilter extends TestCase { + private static final String prefixes = +# this is a comment\n + +\n + +http://\n; + +https://\n; + +file://\n + +ftp://\n;; + + private static final String[] urls = new String[] { +http://www.example.com/;, +https://www.example.com/;, +ftp://www.example.com/;, +file://www.example.com/, +abcd://www.example.com/, +www.example.com/, + }; + + private static String[] urlsModeAccept = new String[] { +urls[0], +urls[1], +urls[2], +urls[3], +null, +null + }; + + private PrefixURLFilter filter = null; + + public static Test suite() { +return new TestSuite(TestPrefixURLFilter.class); + } + + public static void main(String[] args) { +TestRunner.run(suite()); + } + + public void setUp() throws IOException { +filter = new PrefixURLFilter(prefixes