[ 
https://issues.apache.org/jira/browse/NUTCH-1106?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16546452#comment-16546452
 ] 

ASF GitHub Bot commented on NUTCH-1106:
---------------------------------------

sebastian-nagel closed pull request #359: NUTCH-1106 Options to skip url's 
based on length
URL: https://github.com/apache/nutch/pull/359
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ccce56b20..d7ad16f09 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -700,6 +700,21 @@
   </description>
 </property>
 
+<property>
+  <name>db.max.outlink.length</name>
+  <value>4096</value>
+  <description>
+    The maximum length in characters accepted for outlinks before
+    applying URL normalizers and filters.  If this value is
+    nonnegative (>=0), only URLs with a length in characters less or
+    equal than db.max.outlink.length are accepted and then passed to
+    URL normalizers and filters. Doing the length check beforehand
+    avoids that normalizers or filters hang up on overlong URLs.
+    Note: this property is only used to check URLs found as outlinks
+    and redirects, but not for injected URLs.
+  </description>
+</property>
+
 <property>
   <name>db.parsemeta.to.crawldb</name>
   <value></value>
diff --git a/conf/regex-urlfilter.txt.template 
b/conf/regex-urlfilter.txt.template
index b060cbb7b..4319bf164 100644
--- a/conf/regex-urlfilter.txt.template
+++ b/conf/regex-urlfilter.txt.template
@@ -26,6 +26,9 @@
 # skip file: ftp: and mailto: urls
 -^(file|ftp|mailto):
 
+# skip URLs longer than 2048 characters, see also db.max.outlink.length
+#-^.{2049,}
+
 # skip image and other suffixes we can't yet parse
 # for a more extensive coverage use the urlfilter-suffix plugin
 
-(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java 
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 6e86a8f59..088b0fdd9 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -95,8 +95,8 @@
   private String ignoreExternalLinksMode;
 
   // Used by fetcher.follow.outlinks.depth in parse
-  private int maxOutlinksPerPage;
   private final int maxOutlinks;
+  private final int maxOutlinkLength;
   private final int interval;
   private int maxOutlinkDepth;
   private int maxOutlinkDepthNumLinks;
@@ -197,9 +197,11 @@ public FetcherThread(Configuration conf, AtomicInteger 
activeThreads, FetchItemQ
         Thread.currentThread().getId(), queueMode);
     this.maxRedirect = conf.getInt("http.redirect.max", 3);
 
-    maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
+    int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
     maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
         : maxOutlinksPerPage;
+    int maxOutlinkL = conf.getInt("db.max.outlink.length", 4096);
+    maxOutlinkLength = (maxOutlinkL < 0) ? Integer.MAX_VALUE : maxOutlinkL;
     interval = conf.getInt("db.fetch.interval.default", 2592000);
     ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
     ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
@@ -482,6 +484,9 @@ public void run() {
   private Text handleRedirect(FetchItem fit, String newUrl,
       boolean temp, String redirType)
       throws MalformedURLException, URLFilterException, InterruptedException {
+    if (newUrl.length() > maxOutlinkLength) {
+      return null;
+    }
     newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
     newUrl = urlFilters.filter(newUrl);
     String urlString = fit.url.toString();
@@ -721,6 +726,9 @@ private ParseStatus output(Text key, CrawlDatum datum, 
Content content,
           for (int i = 0; i < links.length && validCount < outlinksToStore; 
i++) {
             String toUrl = links[i].getToUrl();
 
+            if (toUrl.length() > maxOutlinkLength) {
+              continue;
+            }
             toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
                 origin, ignoreInternalLinks, ignoreExternalLinks,
                 ignoreExternalLinksMode, urlFiltersForOutlinks,
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index d24f9ce4b..82e3c9a86 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -162,9 +162,12 @@ public String getUniqueFile(TaskAttemptContext context, 
String name){
     final boolean storeText = conf.getBoolean("parser.store.text", true);
 
     int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
-    final boolean isParsing = conf.getBoolean("fetcher.parse", true);
     final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
         : maxOutlinksPerPage;
+    int maxOutlinkL = conf.getInt("db.max.outlink.length", 4096);
+    final int maxOutlinkLength = (maxOutlinkL < 0) ? Integer.MAX_VALUE
+        : maxOutlinkL;
+    final boolean isParsing = conf.getBoolean("fetcher.parse", true);
     final CompressionType compType = SequenceFileOutputFormat
         .getOutputCompressionType(context);
     Path out = FileOutputFormat.getOutputPath(context);
@@ -301,6 +304,9 @@ public void write(Text key, Parse parse) throws IOException 
{
 
           // only normalize and filter if fetcher.parse = false
           if (!isParsing) {
+            if (toUrl.length() > maxOutlinkLength) {
+              continue;
+            }
             toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
                 ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
             if (toUrl == null) {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Options to skip url's based on length
> -------------------------------------
>
>                 Key: NUTCH-1106
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1106
>             Project: Nutch
>          Issue Type: Improvement
>          Components: linkdb
>    Affects Versions: 1.3
>            Reporter: Markus Jelsma
>            Assignee: Sebastian Nagel
>            Priority: Major
>             Fix For: 1.15
>
>         Attachments: NUTCH-1106-1.4-1.patch
>
>
> Adds an option to skip URLs exceeding a certain length. At first we used regex 
> to impose this limit, but having this option configurable is more convenient. 
> Comments?



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to