[1/2] nutch git commit: NUTCH-2222 re-fetch deletes all metadata except _csh_ and _rs_

lewismc Thu, 07 Apr 2016 08:16:20 -0700

Repository: nutch
Updated Branches:
  refs/heads/2.x 1e65c3f6b -> d868f06cf



NUTCH-2222 re-fetch deletes all metadata except _csh_ and _rs_


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fd478448
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fd478448
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fd478448

Branch: refs/heads/2.x
Commit: fd478448bb4a68ad53520ae7b325204a7834782a
Parents: 3e80673
Author: Lewis John McGibbney <[email protected]>
Authored: Mon Mar 21 20:46:18 2016 -0700
Committer: Lewis John McGibbney <[email protected]>
Committed: Mon Mar 21 20:46:18 2016 -0700

----------------------------------------------------------------------
 .../org/apache/nutch/crawl/GeneratorJob.java    | 14 +++-
 src/test/nutch-site.xml                         |  7 ++
 .../org/apache/nutch/fetcher/TestFetcher.java   | 84 ++++++++++++++++++--
 3 files changed, 94 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/java/org/apache/nutch/crawl/GeneratorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java 
b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index aae2ba9..e06a192 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -240,9 +240,17 @@ public class GeneratorJob extends NutchTool implements 
Tool {
 
   /**
    * Mark URLs ready for fetching.
-   * 
-   * @throws ClassNotFoundException
-   * @throws InterruptedException
+   * @param topN
+   *          top threshold for maximum number of URLs permitted in a batch
+   * @param curTime
+   *          the current time in milliseconds
+   * @param filter
+   *          optional filtering of URLs within the generated batch
+   * @param norm
+   *          optional normalization of URLs within the generated batch
+   * @param sitemap
+   *          flag indicating whether a URL is a sitemap and hence processed 
accordingly
+   * @throws Exception
    * */
   public String generate(long topN, long curTime, boolean filter, boolean norm,
       boolean sitemap) throws Exception {

http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/nutch-site.xml
----------------------------------------------------------------------
diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml
index 4f3ced4..e599547 100644
--- a/src/test/nutch-site.xml
+++ b/src/test/nutch-site.xml
@@ -22,4 +22,11 @@
   <description>Default in-memory datastore class for temp test 
data.</description>
 </property>
 
+<property>
+  <name>db.fetch.interval.default</name>
+  <value>1</value>
+  <description>The default number of seconds between re-fetches of a page (1 
second, so that tests can re-fetch immediately).
+  </description>
+</property>
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java 
b/src/test/org/apache/nutch/fetcher/TestFetcher.java
index 2411a61..8a8fa42 100644
--- a/src/test/org/apache/nutch/fetcher/TestFetcher.java
+++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java
@@ -23,9 +23,12 @@ import java.util.List;
 import java.util.Map;
 
 import org.apache.hadoop.fs.Path;
+import org.apache.nutch.crawl.DbUpdaterJob;
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.InjectorJob;
 import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.storage.Mark;
@@ -34,22 +37,24 @@ import org.apache.nutch.util.AbstractNutchTest;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.CrawlTestUtil;
 import org.mortbay.jetty.Server;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import crawlercommons.robots.BaseRobotRules;
 
 import org.junit.After;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 import static org.junit.Assert.*;
 
 /**
- * Basic fetcher test 1. generate seedlist 2. inject 3. generate 3. fetch 4.
- * Verify contents
- * 
+ * Various fetcher tests which test fetching, refetching, sitemap fetching,
+ * sitemap detection and the basic verification of an agent name check. 
  */
 public class TestFetcher extends AbstractNutchTest {
 
+  private static final Logger LOG = 
LoggerFactory.getLogger(AbstractNutchTest.class);
+
   final static Path testdir = new Path("build/test/fetch-test");
   Path urlPath;
   Server server;
@@ -58,6 +63,7 @@ public class TestFetcher extends AbstractNutchTest {
   @Before
   public void setUp() throws Exception {
     super.setUp();
+    conf.setBoolean(FetcherJob.PARSE_KEY, true);
     urlPath = new Path(testdir, "urls");
     server = CrawlTestUtil.getServer(conf.getInt("content.server.port", 50000),
         "build/test/data/fetch-test-site");
@@ -117,7 +123,6 @@ public class TestFetcher extends AbstractNutchTest {
 
     // fetch
     time = System.currentTimeMillis();
-    conf.setBoolean(FetcherJob.PARSE_KEY, true);
     FetcherJob fetcher = new FetcherJob(conf);
     fetcher.fetch(batchId, 1, false, -1);
 
@@ -154,6 +159,68 @@ public class TestFetcher extends AbstractNutchTest {
   }
 
   /**
+   * Tests a refetch of a URL. This process consists of two consecutive
+   * inject, generate, fetch, parse then update cycles. The test configuration
+   * is defined such that <code>db.fetch.interval.default</code> is set to 
+   * a very low value (indicating that the URL should be fetched again 
immediately).
+   * In addition, the test verifies that the relevant 
+   * {@link org.apache.nutch.metadata.Metadata} is present and the values are 
consistent 
+   * and therefore not overwritten.
+   * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2222">NUTCH-2222</a>
+   * @throws Exception
+   */
+  @Test
+  public void testReFetch() throws Exception {
+
+    // generate seedlist
+    ArrayList<String> urls = new ArrayList<String>();
+    // inject
+    addUrl(urls, "index.html");
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob(conf);
+    injector.inject(urlPath);
+
+    // crawl 1 
+    long time = System.currentTimeMillis();
+    GeneratorJob g = new GeneratorJob(conf);
+    String batchId = g.generate(Long.MAX_VALUE, time, false, false, false);
+    FetcherJob fetcher = new FetcherJob(conf);
+    fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1);
+    ParserJob parser = new ParserJob(conf);
+    parser.parse(Nutch.ALL_BATCH_ID_STR, true, true);
+    URLWebPage up = CrawlTestUtil.readContents(webPageStore, Mark.FETCH_MARK, 
(String[]) null).get(0);
+    assertEquals(urls.size(), 1);
+    int countMetaDatasFetch1 = up.getDatum().getMetadata().size();
+    DbUpdaterJob updateter = new DbUpdaterJob(conf);
+    updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR});
+
+
+    Thread.sleep(10000);
+
+    // crawl 2
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+    injector = new InjectorJob(conf);
+    injector.inject(urlPath);
+    g = new GeneratorJob(conf);
+    time = System.currentTimeMillis();
+    batchId = g.generate(Long.MAX_VALUE, time, false, false, false); 
+    fetcher = new FetcherJob(conf);
+    fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1);
+    parser = new ParserJob(conf);
+    parser.parse(Nutch.ALL_BATCH_ID_STR, true, true);
+    updateter = new DbUpdaterJob(conf);
+    updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR});
+    up = CrawlTestUtil.readContents(webPageStore, null, (String[]) 
null).get(0);
+    assertEquals(urls.size(), 1);
+    int countMetaDatasFetch2 = up.getDatum().getMetadata().size();
+
+    LOG.info("countMetaDatas Fetch1 : {}",  countMetaDatasFetch1);
+    LOG.info("countMetaDatas Fetch2 : {}",  countMetaDatasFetch2);
+    assertEquals(countMetaDatasFetch1, countMetaDatasFetch2);
+  }
+
+  /**
    * Test that only sitemap page fetcher
    *
    * @throws Exception
@@ -201,7 +268,6 @@ public class TestFetcher extends AbstractNutchTest {
     //    generate for only sitemap
     g.generate(Long.MAX_VALUE, time, false, false, true);
 
-    conf.setBoolean(FetcherJob.PARSE_KEY, true);
     FetcherJob fetcher = new FetcherJob(conf);
 
     // for only sitemap fetch
@@ -265,7 +331,6 @@ public class TestFetcher extends AbstractNutchTest {
 
     g.generate(Long.MAX_VALUE, time, false, false, false);
 
-    conf.setBoolean(FetcherJob.PARSE_KEY, true);
     FetcherJob fetcher = new FetcherJob(conf);
 
     // for only sitemap fetch
@@ -287,6 +352,10 @@ public class TestFetcher extends AbstractNutchTest {
     }
   }
 
+  /** 
+   * Maps a webpage to the local Jetty server address so that it can 
+   * be fetched as part of an arraylist
+   */
   private void addUrl(ArrayList<String> urls, String page) {
     urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
         + page);
@@ -299,7 +368,6 @@ public class TestFetcher extends AbstractNutchTest {
     conf.set("http.agent.name", "");
 
     try {
-      conf.setBoolean(FetcherJob.PARSE_KEY, true);
       FetcherJob fetcher = new FetcherJob(conf);
       fetcher.checkConfiguration();
     } catch (IllegalArgumentException iae) {

[1/2] nutch git commit: NUTCH-2222 re-fetch deletes all metadata except _csh_ and _rs_

Reply via email to