Repository: nutch Updated Branches: refs/heads/2.x 1e65c3f6b -> d868f06cf
NUTCH-2222 re-fetch deletes all metadata except _csh_ and _rs_ Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fd478448 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fd478448 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fd478448 Branch: refs/heads/2.x Commit: fd478448bb4a68ad53520ae7b325204a7834782a Parents: 3e80673 Author: Lewis John McGibbney <[email protected]> Authored: Mon Mar 21 20:46:18 2016 -0700 Committer: Lewis John McGibbney <[email protected]> Committed: Mon Mar 21 20:46:18 2016 -0700 ---------------------------------------------------------------------- .../org/apache/nutch/crawl/GeneratorJob.java | 14 +++- src/test/nutch-site.xml | 7 ++ .../org/apache/nutch/fetcher/TestFetcher.java | 84 ++++++++++++++++++-- 3 files changed, 94 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/java/org/apache/nutch/crawl/GeneratorJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java index aae2ba9..e06a192 100644 --- a/src/java/org/apache/nutch/crawl/GeneratorJob.java +++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java @@ -240,9 +240,17 @@ public class GeneratorJob extends NutchTool implements Tool { /** * Mark URLs ready for fetching. 
- * - * @throws ClassNotFoundException - * @throws InterruptedException + * @param topN + * top threshold for maximum number of URLs permitted in a batch + * @param curTime + * the current time in milliseconds + * @param filter + * optional filtering of URLs within the generated batch + * @param norm + * optional normalization of URLs within the generated batch + * @param sitemap + * flag indicating whether a URL is a sitemap and hence processed accordingly + * @throws Exception * */ public String generate(long topN, long curTime, boolean filter, boolean norm, boolean sitemap) throws Exception { http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/nutch-site.xml ---------------------------------------------------------------------- diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml index 4f3ced4..e599547 100644 --- a/src/test/nutch-site.xml +++ b/src/test/nutch-site.xml @@ -22,4 +22,11 @@ <description>Default in-memory datastore class for temp test data.</description> </property> +<property> + <name>db.fetch.interval.default</name> + <value>1</value> + <description>The default number of seconds between re-fetches of a page (set to 1 second for tests, so that pages are immediately eligible for re-fetch).
+ </description> +</property> + </configuration> http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/org/apache/nutch/fetcher/TestFetcher.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java index 2411a61..8a8fa42 100644 --- a/src/test/org/apache/nutch/fetcher/TestFetcher.java +++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java @@ -23,9 +23,12 @@ import java.util.List; import java.util.Map; import org.apache.hadoop.fs.Path; +import org.apache.nutch.crawl.DbUpdaterJob; import org.apache.nutch.crawl.GeneratorJob; import org.apache.nutch.crawl.InjectorJob; import org.apache.nutch.crawl.URLWebPage; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.ParserJob; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.storage.Mark; @@ -34,22 +37,24 @@ import org.apache.nutch.util.AbstractNutchTest; import org.apache.nutch.util.Bytes; import org.apache.nutch.util.CrawlTestUtil; import org.mortbay.jetty.Server; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import crawlercommons.robots.BaseRobotRules; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; /** - * Basic fetcher test 1. generate seedlist 2. inject 3. generate 3. fetch 4. - * Verify contents - * + * Various fetcher tests which test fetching, refetching, sitemap fetching, + * sitemap detection and the basic verification of an agent name check.
*/ public class TestFetcher extends AbstractNutchTest { + private static final Logger LOG = LoggerFactory.getLogger(AbstractNutchTest.class); + final static Path testdir = new Path("build/test/fetch-test"); Path urlPath; Server server; @@ -58,6 +63,7 @@ public class TestFetcher extends AbstractNutchTest { @Before public void setUp() throws Exception { super.setUp(); + conf.setBoolean(FetcherJob.PARSE_KEY, true); urlPath = new Path(testdir, "urls"); server = CrawlTestUtil.getServer(conf.getInt("content.server.port", 50000), "build/test/data/fetch-test-site"); @@ -117,7 +123,6 @@ public class TestFetcher extends AbstractNutchTest { // fetch time = System.currentTimeMillis(); - conf.setBoolean(FetcherJob.PARSE_KEY, true); FetcherJob fetcher = new FetcherJob(conf); fetcher.fetch(batchId, 1, false, -1); @@ -154,6 +159,68 @@ public class TestFetcher extends AbstractNutchTest { } /** + * Tests a refetch of a URL. This process consists of two consecutive + * inject, generate, fetch, parse then update cycles. The test configuration + * is defined such that <code>db.fetch.interval.default</code> is set to + * a very low value (indicating that the URL should be fetched again immediately). + * In addition, configuration tests that relevant + * {@link org.apache.nutch.metadata.Metadata} is present and the values consistent + * and therefore not overwritten. 
+ * @see https://issues.apache.org/jira/browse/NUTCH-2222 + * @throws Exception + */ + @Test + public void testReFetch() throws Exception { + + // generate seedlist + ArrayList<String> urls = new ArrayList<String>(); + // inject + addUrl(urls, "index.html"); + CrawlTestUtil.generateSeedList(fs, urlPath, urls); + + InjectorJob injector = new InjectorJob(conf); + injector.inject(urlPath); + + // crawl 1 + long time = System.currentTimeMillis(); + GeneratorJob g = new GeneratorJob(conf); + String batchId = g.generate(Long.MAX_VALUE, time, false, false, false); + FetcherJob fetcher = new FetcherJob(conf); + fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1); + ParserJob parser = new ParserJob(conf); + parser.parse(Nutch.ALL_BATCH_ID_STR, true, true); + URLWebPage up = CrawlTestUtil.readContents(webPageStore, Mark.FETCH_MARK, (String[]) null).get(0); + assertEquals(urls.size(), 1); + int countMetaDatasFetch1 = up.getDatum().getMetadata().size(); + DbUpdaterJob updateter = new DbUpdaterJob(conf); + updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR}); + + + Thread.sleep(10000); + + // crawl 2 + CrawlTestUtil.generateSeedList(fs, urlPath, urls); + injector = new InjectorJob(conf); + injector.inject(urlPath); + g = new GeneratorJob(conf); + time = System.currentTimeMillis(); + batchId = g.generate(Long.MAX_VALUE, time, false, false, false); + fetcher = new FetcherJob(conf); + fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1); + parser = new ParserJob(conf); + parser.parse(Nutch.ALL_BATCH_ID_STR, true, true); + updateter = new DbUpdaterJob(conf); + updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR}); + up = CrawlTestUtil.readContents(webPageStore, null, (String[]) null).get(0); + assertEquals(urls.size(), 1); + int countMetaDatasFetch2 = up.getDatum().getMetadata().size(); + + LOG.info("countMetaDatas Fetch1 : {}", countMetaDatasFetch1); + LOG.info("countMetaDatas Fetch2 : {}", countMetaDatasFetch2); + assertEquals(countMetaDatasFetch1, countMetaDatasFetch2); + } + + /** 
* Test that only sitemap page fetcher * * @throws Exception @@ -201,7 +268,6 @@ public class TestFetcher extends AbstractNutchTest { // generate for only sitemap g.generate(Long.MAX_VALUE, time, false, false, true); - conf.setBoolean(FetcherJob.PARSE_KEY, true); FetcherJob fetcher = new FetcherJob(conf); // for only sitemap fetch @@ -265,7 +331,6 @@ public class TestFetcher extends AbstractNutchTest { g.generate(Long.MAX_VALUE, time, false, false, false); - conf.setBoolean(FetcherJob.PARSE_KEY, true); FetcherJob fetcher = new FetcherJob(conf); // for only sitemap fetch @@ -287,6 +352,10 @@ public class TestFetcher extends AbstractNutchTest { } } + /** + * Maps a webpage to the local Jetty server address so that it can + * be fetched as part of an arraylist + */ private void addUrl(ArrayList<String> urls, String page) { urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/" + page); @@ -299,7 +368,6 @@ public class TestFetcher extends AbstractNutchTest { conf.set("http.agent.name", ""); try { - conf.setBoolean(FetcherJob.PARSE_KEY, true); FetcherJob fetcher = new FetcherJob(conf); fetcher.checkConfiguration(); } catch (IllegalArgumentException iae) {
