Author: siren Date: Sat Sep 2 05:38:50 2006 New Revision: 439582 URL: http://svn.apache.org/viewvc?rev=439582&view=rev Log: Add simple unit tests for injector and generator
Added: lucene/nutch/trunk/src/test/crawl-tests.xml lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Added: lucene/nutch/trunk/src/test/crawl-tests.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439582&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/crawl-tests.xml (added) +++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep 2 05:38:50 2006 @@ -0,0 +1,13 @@ +<?xml version="1.0"?> + +<!-- Configuration overrides used during unit tests. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <value>urlfilter-suffix|scoring-opic</value> + <description>Enable required plugins.</description> +</property> + +</configuration> \ No newline at end of file Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439582&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep 2 05:38:50 2006 @@ -0,0 +1,69 @@ +package org.apache.nutch.crawl; + +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.UTF8; + +public class CrawlDBTestUtil { + + private static final Log LOG = LogFactory.getLog(CrawlDBTestUtil.class); + + /** + * Creates synthetic crawldb + * + * @param fs + * filesystem where db will 
be created + * @param crawldb + * path where db will be created + * @param init + * urls to be inserted, objects are of type URLCrawlDatum + * @throws Exception + */ + public static void createCrawlDb(FileSystem fs, Path crawldb, List<URLCrawlDatum> init) + throws Exception { + LOG.trace("* creating crawldb: " + crawldb); + Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME); + MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000") + .toString(), UTF8.class, CrawlDatum.class); + Iterator<URLCrawlDatum> it = init.iterator(); + while (it.hasNext()) { + URLCrawlDatum row = it.next(); + LOG.info("adding:" + row.url.toString()); + writer.append(new UTF8(row.url), row.datum); + } + writer.close(); + } + + /** + * For now we need to manually construct our Configuration, because we need to + * override the default one and it is currently not possible to use dynamically + * set values. + * + * @return + */ + public static Configuration create(){ + Configuration conf=new Configuration(); + conf.addDefaultResource("nutch-default.xml"); + conf.addFinalResource("crawl-tests.xml"); + return conf; + } + + public static class URLCrawlDatum { + + UTF8 url; + + CrawlDatum datum; + + public URLCrawlDatum(UTF8 url, CrawlDatum datum) { + this.url = url; + this.datum = datum; + } + } +} Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439582&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep 2 05:38:50 2006 @@ -0,0 +1,147 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.UTF8; +import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; + +import junit.framework.TestCase; + +/** + * Basic generator test: + * 1. Insert entries in crawldb + * 2. Generates entries to fetch + * 3. Verifies that number of generated urls match + * 4. 
Verifies that highest scoring urls are generated + + * @author nutch-dev <nutch-dev at lucene.apache.org> + * + */ +public class TestGenerator extends TestCase { + + Configuration conf; + + Path dbDir; + + Path segmentsDir; + + FileSystem fs; + + protected void setUp() throws Exception { + conf = CrawlDBTestUtil.create(); + } + + protected void tearDown() { + delete(dbDir); + delete(segmentsDir); + } + + private void delete(Path p) { + try { + fs.delete(p); + } catch (IOException e) { + } + } + + /** + * Test that generator generates fetchlist ordered by score (desc) + * + * @throws Exception + */ + public void testGenerateHighest() throws Exception { + + int NUM_RESULTS=2; + + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + for(int i=0;i<=100;i++){ + list.add(new CrawlDBTestUtil.URLCrawlDatum(new UTF8("http://aaa/" + pad(i)), + new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i))); + } + + fs = FileSystem.get(conf); + dbDir = new Path("test-crawldb-" + new java.util.Random().nextInt()); + segmentsDir = new Path("test-crawldb-segments" + new java.util.Random().nextInt()); + fs.mkdirs(dbDir); + fs.mkdirs(segmentsDir); + + // create crawldb + CrawlDBTestUtil.createCrawlDb(fs, dbDir, list); + + // generate segment + Generator g=new Generator(conf); + Path generatedSegment=g.generate(dbDir, segmentsDir,0,NUM_RESULTS, Long.MAX_VALUE); + + Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),"part-00000"); + + // verify results + SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf); + + ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>(); + + READ: + do { + UTF8 key=new UTF8(); + CrawlDatum value=new CrawlDatum(); + if(!reader.next(key, value)) break READ; + l.add(new URLCrawlDatum(key, value)); + } while(true); + + reader.close(); + + // sort urls by score desc + Collections.sort(l, new ScoreComparator()); + + //verify we got right amount of records + assertEquals(NUM_RESULTS, l.size()); + + 
//verify we have the highest scoring urls + assertEquals("http://aaa/100", (l.get(0).url.toString())); + assertEquals("http://aaa/099", (l.get(1).url.toString())); + } + + private String pad(int i) { + String s=Integer.toString(i); + while(s.length()<3) + s="0" + s; + return s; + } + + /** + * Comparator that sorts by score desc + */ + public class ScoreComparator implements Comparator<URLCrawlDatum> { + + public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) { + + if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) + return -1; + if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) + return 1; + + return 0; + } + } +} Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439582&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep 2 05:38:50 2006 @@ -0,0 +1,128 @@ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.UTF8; + +import junit.framework.TestCase; + +/** + * Basic injector test: + * 1. Creates a text file with urls + * 2. Injects them into crawldb + * 3. Reads crawldb entries and verifies contents + * 4. Injects more urls into crawldb + * 5. 
Reads crawldb entries and verifies contents + * + * @author nutch-dev <nutch-dev at lucene.apache.org> + */ +public class TestInjector extends TestCase { + + private FSDataOutputStream out; + private Configuration conf; + private FileSystem fs; + final static Path testdir=new Path("build/test/inject-test"); + Path crawldbPath; + Path urlPath; + + protected void setUp() throws Exception { + conf = CrawlDBTestUtil.create(); + urlPath=new Path(testdir,"urls"); + crawldbPath=new Path(testdir,"crawldb"); + fs=FileSystem.get(conf); + + } + + protected void tearDown() throws IOException{ + fs.delete(testdir); + } + + public void testInject() throws IOException { + ArrayList<String> urls=new ArrayList<String>(); + for(int i=0;i<100;i++) { + urls.add("http://zzz/" + i + ".html"); + } + generateSeedList(urls); + + Injector injector=new Injector(conf); + injector.inject(crawldbPath, urlPath); + + // verify results + List<String>read=readCrawldb(); + + Collections.sort(read); + Collections.sort(urls); + + assertEquals(urls.size(), read.size()); + + assertTrue(read.containsAll(urls)); + assertTrue(urls.containsAll(read)); + + //inject more urls + ArrayList<String> urls2=new ArrayList<String>(); + for(int i=0;i<100;i++) { + urls2.add("http://xxx/" + i + ".html"); + } + generateSeedList(urls2); + injector.inject(crawldbPath, urlPath); + urls.addAll(urls2); + + // verify results + read=readCrawldb(); + + + Collections.sort(read); + Collections.sort(urls); + + assertEquals(urls.size(), read.size()); + + assertTrue(read.containsAll(urls)); + assertTrue(urls.containsAll(read)); + + } + + /** + * Generate seedlist + * @throws IOException + */ + private void generateSeedList(List<String> contents) throws IOException{ + Path file=new Path(urlPath,"urls.txt"); + fs.mkdirs(urlPath); + out=fs.create(file); + Iterator<String> iterator=contents.iterator(); + while(iterator.hasNext()){ + String url=iterator.next(); + out.writeBytes(url); + out.writeBytes("\n"); + } + out.flush(); + 
out.close(); + } + + private List<String> readCrawldb() throws IOException{ + Path dbfile=new Path(crawldbPath,CrawlDatum.DB_DIR_NAME + "/part-00000/data"); + System.out.println("reading:" + dbfile); + SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf); + ArrayList<String> read=new ArrayList<String>(); + + READ: + do { + UTF8 key=new UTF8(); + CrawlDatum value=new CrawlDatum(); + if(!reader.next(key, value)) break READ; + read.add(key.toString()); + } while(true); + + return read; + } + +} ------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs