Author: siren Date: Sat Sep 2 08:44:28 2006 New Revision: 439610 URL: http://svn.apache.org/viewvc?rev=439610&view=rev Log: add simple junit test for fetcher
Added: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java lucene/nutch/trunk/src/testresources/ lucene/nutch/trunk/src/testresources/fetch-test-site/ lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html lucene/nutch/trunk/src/testresources/fetch-test-site/index.html lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt Modified: lucene/nutch/trunk/src/test/crawl-tests.xml lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Modified: lucene/nutch/trunk/src/test/crawl-tests.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439610&r1=439609&r2=439610&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/crawl-tests.xml (original) +++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep 2 08:44:28 2006 @@ -6,8 +6,33 @@ <property> <name>plugin.includes</name> - <value>urlfilter-suffix|scoring-opic</value> + <value>parse-html|protocol-http|urlfilter-suffix|scoring-opic</value> <description>Enable required plugins.</description> +</property> + +<property> + <name>content.server.port</name> + <value>55000</value> + <description>Port of http server serving content.</description> +</property> + +<property> + <name>fetcher.server.delay</name> + <value>1.0</value> + <description>The number of seconds the fetcher will delay between + successive requests to the same server.</description> +</property> + +<property> + <name>fetcher.server.delay</name> + <value>1.0</value> + <description>The number of seconds the fetcher will delay between + successive requests to the same server.</description> +</property> + +<property> + <name>http.agent.name</name> + 
<value>test-nutch</value> </property> </configuration> Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439610&r1=439609&r2=439610&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep 2 08:44:28 2006 @@ -15,16 +15,24 @@ */ package org.apache.nutch.crawl; +import java.io.File; +import java.io.IOException; +import java.net.UnknownHostException; import java.util.Iterator; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.UTF8; +import org.mortbay.http.HttpContext; +import org.mortbay.http.SocketListener; +import org.mortbay.http.handler.ResourceHandler; +import org.mortbay.jetty.Server; public class CrawlDBTestUtil { @@ -62,8 +70,20 @@ * set values. * * @return + * @deprecated Use {@link #createConfiguration()} instead */ public static Configuration create(){ + return createConfiguration(); + } + + /** + * For now we need to manually construct our Configuration, because we need to + * override the default one and it is currently not possible to use dynamically + * set values. 
+ * + * @return + */ + public static Configuration createConfiguration(){ Configuration conf=new Configuration(); conf.addDefaultResource("nutch-default.xml"); conf.addFinalResource("crawl-tests.xml"); @@ -80,5 +100,45 @@ this.url = url; this.datum = datum; } + } + + /** + * Generate seedlist + * @throws IOException + */ + public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{ + FSDataOutputStream out; + Path file=new Path(urlPath,"urls.txt"); + fs.mkdirs(urlPath); + out=fs.create(file); + Iterator<String> iterator=contents.iterator(); + while(iterator.hasNext()){ + String url=iterator.next(); + out.writeBytes(url); + out.writeBytes("\n"); + } + out.flush(); + out.close(); + } + + /** + * Creates a new JettyServer with one static root context + * + * @param port port to listen to + * @param staticContent folder where static content lives + * @throws UnknownHostException + */ + public static Server getServer(int port, String staticContent) throws UnknownHostException{ + Server webServer = new org.mortbay.jetty.Server(); + SocketListener listener = new SocketListener(); + listener.setPort(port); + listener.setHost("127.0.0.1"); + webServer.addListener(listener); + HttpContext staticContext = new HttpContext(); + staticContext.setContextPath("/"); + staticContext.setResourceBase(staticContent); + staticContext.addHandler(new ResourceHandler()); + webServer.addContext(staticContext); + return webServer; } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439610&r1=439609&r2=439610&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep 2 08:44:28 2006 @@ -50,7 +50,7 @@ FileSystem fs; 
protected void setUp() throws Exception { - conf = CrawlDBTestUtil.create(); + conf = CrawlDBTestUtil.createConfiguration(); } protected void tearDown() { Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439610&r1=439609&r2=439610&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep 2 08:44:28 2006 @@ -18,11 +18,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; -import java.util.Iterator; import java.util.List; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; @@ -42,7 +40,6 @@ */ public class TestInjector extends TestCase { - private FSDataOutputStream out; private Configuration conf; private FileSystem fs; final static Path testdir=new Path("build/test/inject-test"); @@ -50,7 +47,7 @@ Path urlPath; protected void setUp() throws Exception { - conf = CrawlDBTestUtil.create(); + conf = CrawlDBTestUtil.createConfiguration(); urlPath=new Path(testdir,"urls"); crawldbPath=new Path(testdir,"crawldb"); fs=FileSystem.get(conf); @@ -66,7 +63,7 @@ for(int i=0;i<100;i++) { urls.add("http://zzz/" + i + ".html"); } - generateSeedList(urls); + CrawlDBTestUtil.generateSeedList(fs, urlPath, urls); Injector injector=new Injector(conf); injector.inject(crawldbPath, urlPath); @@ -87,7 +84,7 @@ for(int i=0;i<100;i++) { urls2.add("http://xxx/" + i + ".html"); } - generateSeedList(urls2); + CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2); injector.inject(crawldbPath, urlPath); urls.addAll(urls2); @@ -103,24 +100,6 @@ assertTrue(read.containsAll(urls)); 
 assertTrue(urls.containsAll(read)); - } - - /** - * Generate seedlist - * @throws IOException - */ - private void generateSeedList(List<String> contents) throws IOException{ - Path file=new Path(urlPath,"urls.txt"); - fs.mkdirs(urlPath); - out=fs.create(file); - Iterator<String> iterator=contents.iterator(); - while(iterator.hasNext()){ - String url=iterator.next(); - out.writeBytes(url); - out.writeBytes("\n"); - } - out.flush(); - out.close(); } private List<String> readCrawldb() throws IOException{ Added: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=439610&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sat Sep 2 08:44:28 2006 @@ -0,0 +1,117 @@ +package org.apache.nutch.fetcher; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.UTF8; +import org.apache.nutch.crawl.CrawlDBTestUtil; +import org.apache.nutch.crawl.Generator; +import org.apache.nutch.crawl.Injector; +import org.apache.nutch.protocol.Content; +import org.mortbay.jetty.Server; + +import junit.framework.TestCase; + +/** + * Basic fetcher test + * 1. generate seedlist + * 2. inject + * 3. generate + * 4. fetch + * 5. 
Verify contents + * @author nutch-dev <nutch-dev at lucene.apache.org> + * + */ +public class TestFetcher extends TestCase { + + final static Path testdir=new Path("build/test/fetch-test"); + Configuration conf; + FileSystem fs; + Path crawldbPath; + Path segmentsPath; + Path urlPath; + Server server; + + protected void setUp() throws Exception{ + conf=CrawlDBTestUtil.createConfiguration(); + fs=FileSystem.get(conf); + fs.delete(testdir); + urlPath=new Path(testdir,"urls"); + crawldbPath=new Path(testdir,"crawldb"); + segmentsPath=new Path(testdir,"segments"); + server=CrawlDBTestUtil.getServer(conf.getInt("content.server.port",50000), "build/test/data/fetch-test-site"); + server.start(); + } + + protected void tearDown() throws InterruptedException, IOException{ + server.stop(); + } + + public void testFetch() throws IOException { + + //generate seedlist + ArrayList<String> urls=new ArrayList<String>(); + + addUrl(urls,"index.html"); + addUrl(urls,"pagea.html"); + addUrl(urls,"pageb.html"); + addUrl(urls,"dup_of_pagea.html"); + + CrawlDBTestUtil.generateSeedList(fs, urlPath, urls); + + //inject + Injector injector=new Injector(conf); + injector.inject(crawldbPath, urlPath); + + //generate + Generator g=new Generator(conf); + Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE); + + long time=System.currentTimeMillis(); + //fetch + Fetcher fetcher=new Fetcher(conf); + fetcher.fetch(generatedSegment, 1, true); + + time=System.currentTimeMillis()-time; + + //verify politeness, time taken should be more than (num_of_pages +1)*delay + assertTrue(1000*time > (urls.size() + 1 * conf.getInt("fetcher.server.delay",5))); + + //verify results + Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data"); + SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf); + + ArrayList<String> handledurls=new ArrayList<String>(); + + READ: + do { + UTF8 key=new UTF8(); + Content value=new Content(); + 
if(!reader.next(key, value)) break READ; + handledurls.add(key.toString()); + } while(true); + + reader.close(); + + Collections.sort(urls); + Collections.sort(handledurls); + + //verify that enough pages were handled + assertEquals(urls.size(), handledurls.size()); + + //verify that correct pages were handled + assertTrue(handledurls.containsAll(urls)); + assertTrue(urls.containsAll(handledurls)); + + } + + private void addUrl(ArrayList<String> urls, String page) { + urls.add("http://127.0.0.1:" + server.getListeners()[0].getPort() + "/" + page); + } + +} Added: lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html?rev=439610&view=auto ============================================================================== --- lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html (added) +++ lucene/nutch/trunk/src/testresources/fetch-test-site/dup_of_pagea.html Sat Sep 2 08:44:28 2006 @@ -0,0 +1,9 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +</body> +</html> \ No newline at end of file Added: lucene/nutch/trunk/src/testresources/fetch-test-site/index.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/index.html?rev=439610&view=auto ============================================================================== --- lucene/nutch/trunk/src/testresources/fetch-test-site/index.html (added) +++ lucene/nutch/trunk/src/testresources/fetch-test-site/index.html Sat Sep 2 08:44:28 2006 @@ -0,0 +1,11 @@ +<html> + <head> + <title>front page</title> + </head> +<body> +This is front page. 
+<a href="pagea.html">Page a</a> +<a href="pageb.html">Page b</a> +<a href="dup_of_pagea.html">dup of Page a</a> +</body> +</html> \ No newline at end of file Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html?rev=439610&view=auto ============================================================================== --- lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html (added) +++ lucene/nutch/trunk/src/testresources/fetch-test-site/pagea.html Sat Sep 2 08:44:28 2006 @@ -0,0 +1,9 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +</body> +</html> \ No newline at end of file Added: lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html?rev=439610&view=auto ============================================================================== --- lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html (added) +++ lucene/nutch/trunk/src/testresources/fetch-test-site/pageb.html Sat Sep 2 08:44:28 2006 @@ -0,0 +1,9 @@ +<html> + <head> + <title>page b</title> + </head> +<body> +This is page b +<a href="index.html">home</a> +</body> +</html> \ No newline at end of file Added: lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/robots.txt?rev=439610&view=auto ============================================================================== (empty) ------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? 
Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs