[ 
https://issues.apache.org/jira/browse/NUTCH-1932?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16218817#comment-16218817
 ] 

ASF GitHub Bot commented on NUTCH-1932:
---------------------------------------

sebastian-nagel closed pull request #211: NUTCH-1932 Automatically remove 
orphaned pages
URL: https://github.com/apache/nutch/pull/211
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/build.xml b/build.xml
index a9cf1e927..312bdfabd 100644
--- a/build.xml
+++ b/build.xml
@@ -217,6 +217,7 @@
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
+      <packageset dir="${plugins.dir}/scoring-orphan/src/java"/>
       <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
       <packageset dir="${plugins.dir}/subcollection/src/java"/>
       <packageset dir="${plugins.dir}/tld/src/java"/>
@@ -667,6 +668,7 @@
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
+      <packageset dir="${plugins.dir}/scoring-orphan/src/java"/>
       <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
       <packageset dir="${plugins.dir}/subcollection/src/java"/>
       <packageset dir="${plugins.dir}/tld/src/java"/>
@@ -1100,6 +1102,8 @@
         <source path="${plugins.dir}/scoring-depth/src/java/" />
         <source path="${plugins.dir}/scoring-link/src/java/" />
         <source path="${plugins.dir}/scoring-opic/src/java/" />
+        <source path="${plugins.dir}/scoring-orphan/src/java"/>
+        <source path="${plugins.dir}/scoring-orphan/src/test"/>
         <source path="${plugins.dir}/scoring-similarity/src/java/" />
         <source path="${plugins.dir}/subcollection/src/java/" />
         <source path="${plugins.dir}/subcollection/src/test/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ed0bb985c..745588daf 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -538,6 +538,16 @@
 </property>
 
 <property>
+  <name>db.update.purge.orphans</name>
+  <value>false</value>
+  <description>If true, updatedb will permanently delete URLs marked
+  as orphan from the CrawlDb. The plugin scoring-orphan needs to be
+  activated to get records marked as orphan. See the plugin's options
+  elsewhere in this document.
+  </description>
+</property>
+
+<property>
     <name>db.url.normalizers</name>
     <value>false</value>
     <description>Normalize urls when updating crawldb</description>
@@ -1542,6 +1552,24 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
     </description>
 </property>
 
+<!-- scoring filter orphan properties -->
+
+<property>
+  <name>scoring.orphan.mark.gone.after</name>
+  <value>2592000</value>
+  <description>Time in seconds after which orphaned
+  pages are marked as gone. Default is 30 days.
+  </description>
+</property>
+
+<property>
+  <name>scoring.orphan.mark.orphan.after</name>
+  <value>3456000</value>
+  <description>Time in seconds after which orphaned
+  pages are marked as orphan. Default is 40 days.
+  </description>
+</property>
+
 <!-- language-identifier plugin properties -->
 
 <property>
diff --git a/default.properties b/default.properties
index c1e310fd2..6b7a6ab79 100644
--- a/default.properties
+++ b/default.properties
@@ -129,6 +129,7 @@ plugins.scoring=\
    org.apache.nutch.scoring.depth*:\
    org.apache.nutch.scoring.link*:\
    org.apache.nutch.scoring.opic*:\
+   org.apache.nutch.scoring.orphan*:\
    org.apache.nutch.scoring.similarity*:\
    org.apache.nutch.scoring.tld*:\
    org.apache.nutch.scoring.urlmeta*
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index b9d4cab26..e54c791db 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -61,7 +61,10 @@
   public static final byte STATUS_DB_REDIR_PERM = 0x05;
   /** Page was successfully fetched and found not modified. */
   public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+  /** Page was marked as being a duplicate of another page */
   public static final byte STATUS_DB_DUPLICATE = 0x07;
+  /** Page was marked as orphan, i.e. it has no inlinks anymore */
+  public static final byte STATUS_DB_ORPHAN = 0x08;
 
   /** Maximum value of DB-related status. */
   public static final byte STATUS_DB_MAX = 0x1f;
@@ -100,6 +103,7 @@
     statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
     statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
     statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+    statNames.put(STATUS_DB_ORPHAN, "db_orphan");
     statNames.put(STATUS_SIGNATURE, "signature");
     statNames.put(STATUS_INJECTED, "injected");
     statNames.put(STATUS_LINKED, "linked");
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 7ce4500a7..080b03730 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -51,6 +51,7 @@
   public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
 
   public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
+  public static final String CRAWLDB_PURGE_ORPHANS = "db.update.purge.orphans";
 
   public static final String CURRENT_NAME = "current";
 
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index 0b9cac376..7b2aa80a0 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -39,19 +39,15 @@
 public class CrawlDbFilter implements
     Mapper<Text, CrawlDatum, Text, CrawlDatum> {
   public static final String URL_FILTERING = "crawldb.url.filters";
-
   public static final String URL_NORMALIZING = "crawldb.url.normalizers";
-
   public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
 
   private boolean urlFiltering;
-
   private boolean urlNormalizers;
 
   private boolean url404Purging;
-
+  private boolean purgeOrphans;
   private URLFilters filters;
-
   private URLNormalizers normalizers;
 
   private String scope;
@@ -63,6 +59,7 @@ public void configure(JobConf job) {
     urlFiltering = job.getBoolean(URL_FILTERING, false);
     urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
     url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
+    purgeOrphans = job.getBoolean(CrawlDb.CRAWLDB_PURGE_ORPHANS, false);
 
     if (urlFiltering) {
       filters = new URLFilters(job);
@@ -87,7 +84,16 @@ public void map(Text key, CrawlDatum value,
     // https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
     // cheaper than normalizing or filtering
     if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
-      url = null;
+      reporter.getCounter("CrawlDB filter",
+        "Gone records removed").increment(1);
+      return;
+    }
+    // Whether to remove orphaned pages
+    // https://issues.apache.org/jira/browse/NUTCH-1932
+    if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
+      reporter.getCounter("CrawlDB filter",
+        "Orphan records removed").increment(1);
+      return;
     }
     if (url != null && urlNormalizers) {
       try {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index 0aec451b7..cb04e638e 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -155,7 +155,16 @@ public void reduce(Text key, Iterator<CrawlDatum> values,
 
     // still no new data - record only unchanged old data, if exists, and return
     if (!fetchSet) {
-      if (oldSet) {// at this point at least "old" should be present
+      if (oldSet) { // at this point at least "old" should be present
+        // set score for orphaned pages (not fetched in the current cycle and
+        // with no inlinks)
+        try {
+          scfilters.orphanedScore(key, old);
+        } catch (ScoringFilterException e) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Couldn't update orphaned score, key={}: {}", key, e);
+          }
+        }
         output.collect(key, old);
         reporter.getCounter("CrawlDB status",
             CrawlDatum.getStatusName(old.getStatus())).increment(1);
@@ -312,7 +321,7 @@ public void reduce(Text key, Iterator<CrawlDatum> values,
       scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't update score, key=" + key + ": " + e);
+        LOG.warn("Couldn't update score, key={}: {}", key, e);
       }
     }
     // remove generation time, if any
diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java
index 4061a750b..c1acc482f 100644
--- a/src/java/org/apache/nutch/scoring/ScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java
@@ -164,7 +164,7 @@ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
    *          newly discovered page. If not null, filters should use score
    *          values from this parameter as the starting values - the
    *          <code>datum</code> parameter may contain values that are no longer
-   *          valid, if other updates occured between generation and this
+   *          valid, if other updates occurred between generation and this
    *          update.
    * @param datum
    *          the new datum, with the original score saved at the time when
@@ -179,6 +179,20 @@ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
       List<CrawlDatum> inlinked) throws ScoringFilterException;
 
   /**
+   * This method may change the score or status of CrawlDatum during CrawlDb
+   * update, when the URL is neither fetched nor has any inlinks.
+   *
+   * @param url
+   *          URL of the page
+   * @param datum
+   *          CrawlDatum for page
+   * @throws ScoringFilterException
+   */
+  public default void orphanedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /**
    * This method calculates a Lucene document boost.
    * 
    * @param url
diff --git a/src/java/org/apache/nutch/scoring/ScoringFilters.java b/src/java/org/apache/nutch/scoring/ScoringFilters.java
index 5bad78f60..f9d2f1b87 100644
--- a/src/java/org/apache/nutch/scoring/ScoringFilters.java
+++ b/src/java/org/apache/nutch/scoring/ScoringFilters.java
@@ -81,6 +81,14 @@ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
     }
   }
 
+  /** Calculate orphaned page score during CrawlDb.update(). */
+  public void orphanedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].orphanedScore(url, datum);
+    }
+  }
+
   public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
       throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 769debcda..5402d036c 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -75,6 +75,7 @@
     <ant dir="scoring-depth" target="deploy"/>
     <ant dir="scoring-link" target="deploy"/>
     <ant dir="scoring-opic" target="deploy"/>
+    <ant dir="scoring-orphan" target="deploy"/>
     <ant dir="scoring-similarity" target="deploy"/>
     <ant dir="subcollection" target="deploy"/>
     <ant dir="tld" target="deploy"/>
@@ -127,6 +128,7 @@
      <ant dir="protocol-file" target="test"/>
      <ant dir="protocol-http" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
+     <ant dir="scoring-orphan" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test"/>
@@ -198,6 +200,7 @@
     <ant dir="scoring-depth" target="clean"/>
     <ant dir="scoring-link" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
+    <ant dir="scoring-orphan" target="clean"/>
     <ant dir="scoring-similarity" target="clean"/>
     <ant dir="subcollection" target="clean"/>
     <ant dir="tld" target="clean"/>
diff --git a/src/plugin/scoring-orphan/build.xml b/src/plugin/scoring-orphan/build.xml
new file mode 100644
index 000000000..e0ddd965d
--- /dev/null
+++ b/src/plugin/scoring-orphan/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-orphan" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>
diff --git a/src/plugin/scoring-orphan/ivy.xml b/src/plugin/scoring-orphan/ivy.xml
new file mode 100644
index 000000000..1a86d6803
--- /dev/null
+++ b/src/plugin/scoring-orphan/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/scoring-orphan/plugin.xml b/src/plugin/scoring-orphan/plugin.xml
new file mode 100644
index 000000000..061445175
--- /dev/null
+++ b/src/plugin/scoring-orphan/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-orphan"
+   name="Orphan Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.cc.org">
+
+   <runtime>
+      <library name="scoring-orphan.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.orphan"
+              name="OrphanScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.orphan.OrphanScoringFilter"
+        class="org.apache.nutch.scoring.orphan.OrphanScoringFilter" />
+   </extension>
+
+</plugin>
\ No newline at end of file
diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
new file mode 100644
index 000000000..24cc36625
--- /dev/null
+++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Orphan scoring filter that determines whether a page has become orphaned,
+ * i.e. no other pages link to it anymore. If a page hasn't been linked to
+ * for markGoneAfter seconds, it is marked as gone and is then removed by an
+ * indexer. If a page hasn't been linked to for markOrphanAfter seconds, it
+ * is marked as orphan so that it can be purged from the CrawlDb.
+ */
+public class OrphanScoringFilter extends AbstractScoringFilter {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OrphanScoringFilter.class);
+
+  public static Text ORPHAN_KEY_WRITABLE = new Text("_orphan_");
+
+  private Configuration conf;
+  private static int DEFAULT_GONE_TIME = 30 * 24 * 60 * 60;
+  private static int DEFAULT_ORPHAN_TIME = 40 * 24 * 60 * 60;
+
+  private long markGoneAfter = DEFAULT_GONE_TIME;
+  private long markOrphanAfter = DEFAULT_ORPHAN_TIME;
+
+  public void setConf(Configuration conf) {
+    markGoneAfter = conf.getInt("scoring.orphan.mark.gone.after",
+        DEFAULT_GONE_TIME);
+    markOrphanAfter = conf.getInt("scoring.orphan.mark.orphan.after",
+        DEFAULT_ORPHAN_TIME);
+    if (markGoneAfter > markOrphanAfter) {
+      LOG.warn("OrphanScoringFilter: the time span after which pages are marked"
+          + " as gone is larger than that to mark pages as orphaned"
+          + " (scoring.orphan.mark.gone.after > scoring.orphan.mark.orphan.after):"
+          + " This disables marking pages as gone.");
+    }
+  }
+
+  /**
+   * Used for orphan control: stores the current time in the datum's metadata
+   * when inlinks are present, otherwise delegates to orphanedScore.
+   *
+   * @param url url of the record
+   * @param old old CrawlDatum
+   * @param datum new CrawlDatum
+   * @param inlinks list of inlinked CrawlDatums
+   */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinks) throws ScoringFilterException {
+
+    int now = (int)(System.currentTimeMillis() / 1000);
+
+    // Are there inlinks for this record?
+    if (inlinks.size() > 0) {
+      // Set the last time we have seen this link to NOW
+      datum.getMetaData().put(ORPHAN_KEY_WRITABLE,
+          new IntWritable(now));
+    } else {
+      orphanedScore(url, datum);
+    }
+  }
+
+  public void orphanedScore(Text url, CrawlDatum datum) {
+    // Already has an orphaned time?
+    if (datum.getMetaData().containsKey(ORPHAN_KEY_WRITABLE)) {
+      // Get the last time this hyperlink was inlinked
+      IntWritable writable = (IntWritable)datum.getMetaData()
+          .get(ORPHAN_KEY_WRITABLE);
+      int lastInlinkTime = writable.get();
+      int now = (int) (System.currentTimeMillis() / 1000);
+      int elapsedSinceLastInLinkTime = now - lastInlinkTime;
+
+      if (elapsedSinceLastInLinkTime > markOrphanAfter) {
+        // Mark as orphan so we can permanently delete it
+        datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN);
+      } else if (elapsedSinceLastInLinkTime > markGoneAfter) {
+        // Mark as gone so the indexer can remove it
+        datum.setStatus(CrawlDatum.STATUS_DB_GONE);
+      }
+    }
+  }
+
+}
diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/package-info.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/package-info.java
new file mode 100644
index 000000000..7a2017fb6
--- /dev/null
+++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter to modify score or status of orphaned pages (no inlinks found
+ * for a configurable amount of time).
+ */
+package org.apache.nutch.scoring.orphan;
+
diff --git a/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
new file mode 100644
index 000000000..1fb7b5ad5
--- /dev/null
+++ b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+public class TestOrphanScoringFilter {
+
+  @Test
+  public void testOrphanScoringFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    conf.setInt("scoring.orphan.mark.gone.after", 5);
+    conf.setInt("scoring.orphan.mark.orphan.after", 10);
+
+    ScoringFilter filter = new OrphanScoringFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://nutch.apache.org/");
+    CrawlDatum datum = new CrawlDatum();
+    datum.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
+
+    List<CrawlDatum> emptyListOfInlinks = new ArrayList<CrawlDatum>();
+    List<CrawlDatum> populatedListOfInlinks = new ArrayList<CrawlDatum>();
+    populatedListOfInlinks.add(datum);
+
+    // Act as if record has inlinks
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    int firstOrphanTime = getTime(datum);
+    assertTrue(datum.getMetaData()
+        .containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));
+
+    // Wait a little bit
+    try {
+      Thread.sleep(1000);
+    } catch (Exception e) {
+    }
+
+    // Again, this time orphan time must be increased by about 1000 ms
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    int secondOrphanTime = getTime(datum);
+    assertTrue(secondOrphanTime > firstOrphanTime);
+
+    // Act as if no more inlinks, time will not increase, status is still the
+    // same
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    int thirdOrphanTime = getTime(datum);
+    assertEquals(thirdOrphanTime, secondOrphanTime);
+    assertEquals(
+        "Expected status db_notmodified but got "
+            + CrawlDatum.getStatusName(datum.getStatus()),
+        CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
+
+    // Wait a little bit
+    try {
+      Thread.sleep(1000);
+    } catch (Exception e) {
+    }
+
+    // Act as if no more inlinks, time will not increase, status is still the
+    // same
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(
+        "Expected status db_notmodified but got "
+            + CrawlDatum.getStatusName(datum.getStatus()),
+        CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
+
+    // Wait until mark.gone.after
+    try {
+      Thread.sleep(5000);
+    } catch (Exception e) {
+    }
+
+    // Again, but now markgoneafter has expired and record should be DB_GONE
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    int fourthOrphanTime = getTime(datum);
+    assertEquals(fourthOrphanTime, thirdOrphanTime);
+    assertEquals(
+        "Expected status db_gone but got "
+            + CrawlDatum.getStatusName(datum.getStatus()),
+        CrawlDatum.STATUS_DB_GONE, datum.getStatus());
+
+    // Wait until mark.orphan.after
+    try {
+      Thread.sleep(5000);
+    } catch (Exception e) {
+    }
+
+    // Again, but now markorphanafter has expired and record should be DB_ORPHAN
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(
+        "Expected status db_orphan but got "
+            + CrawlDatum.getStatusName(datum.getStatus()),
+        CrawlDatum.STATUS_DB_ORPHAN, datum.getStatus());
+  }
+
+  protected int getTime(CrawlDatum datum) {
+    IntWritable writable = (IntWritable) datum.getMetaData()
+        .get(OrphanScoringFilter.ORPHAN_KEY_WRITABLE);
+    return writable.get();
+  }
+}
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
index b61a21cfb..01b0d8175 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -64,14 +64,16 @@
       .getLogger(MethodHandles.lookup().lookupClass());
 
   protected static final byte[][] fetchDbStatusPairs = {
-      { -1, STATUS_DB_UNFETCHED }, { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED },
+      { -1, STATUS_DB_UNFETCHED }, // no fetch status counter-part
+      { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED },
       { STATUS_FETCH_GONE, STATUS_DB_GONE },
       { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
       { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM },
       { STATUS_FETCH_NOTMODIFIED, STATUS_DB_NOTMODIFIED },
-      { STATUS_FETCH_RETRY, -1 }, // fetch_retry does not have a CrawlDb
-                                  // counter-part
-      { -1, STATUS_DB_DUPLICATE }, };
+      // fetch_retry does not have a CrawlDb counter-part
+      { STATUS_FETCH_RETRY, -1 },
+      // no fetch status counter-part for duplicates and orphans
+      { -1, STATUS_DB_DUPLICATE }, { -1, STATUS_DB_ORPHAN } };
 
   /** tested {@link FetchSchedule} implementations */
   protected String[] schedules = { "DefaultFetchSchedule",


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Automatically remove orphaned pages
> -----------------------------------
>
>                 Key: NUTCH-1932
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1932
>             Project: Nutch
>          Issue Type: New Feature
>    Affects Versions: 1.13
>            Reporter: Markus Jelsma
>            Assignee: Markus Jelsma
>            Priority: Minor
>             Fix For: 1.14
>
>         Attachments: NUTCH-1932-add.patch, NUTCH-1932.patch, 
> NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, 
> NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, 
> NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, NUTCH-1932.patch, 
> NUTCH-1932.patch, NUTCH-1932.patch
>
>
> Orphan scoring filter that determines whether a page has become orphaned, 
> e.g. it has no more other pages linking to it. If a page hasn't been linked 
> to after markGoneAfter seconds, the page is marked as gone and is then 
> removed by an indexer.  If a page hasn't been linked to after markOrphanAfter 
> seconds, the page is removed from the CrawlDB.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to