add unit tests based on MRUnit
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/288dceed Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/288dceed Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/288dceed Branch: refs/heads/master Commit: 288dceedb7de28457878eecb03a571d082a48cc2 Parents: 3c691eb Author: Sebastian Nagel <[email protected]> Authored: Sun Jan 17 21:32:31 2016 +0100 Committer: Sebastian Nagel <[email protected]> Committed: Thu Feb 25 21:26:30 2016 +0100 ---------------------------------------------------------------------- ivy/ivy.xml | 10 +- ivy/ivysettings.xml | 2 +- src/java/org/apache/nutch/crawl/Injector.java | 7 +- .../nutch/crawl/CrawlDbUpdateTestDriver.java | 138 +++++++++++++++++++ .../apache/nutch/crawl/TestCrawlDbStates.java | 7 +- .../org/apache/nutch/crawl/TestInjector.java | 3 +- .../org/apache/nutch/fetcher/TestFetcher.java | 2 +- 7 files changed, 156 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivy.xml ---------------------------------------------------------------------- diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 206cce7..bc8d293 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -11,7 +11,7 @@ OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> -<ivy-module version="1.0"> +<ivy-module version="1.0" xmlns:maven="http://ant.apache.org/ivy/maven"> <info organisation="org.apache.nutch" module="nutch"> <license name="Apache 2.0" url="http://www.apache.org/licenses/LICENSE-2.0.txt/" /> @@ -98,6 +98,10 @@ <!--artifacts needed for testing --> <dependency org="junit" name="junit" rev="4.11" conf="test->default" /> + <dependency org="org.apache.mrunit" name="mrunit" rev="1.1.0" conf="test->default"> + <artifact name="mrunit" maven:classifier="hadoop2" /> + <exclude org="log4j" module="log4j" /> + </dependency> <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" /> <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" /> <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" /> @@ -125,9 +129,7 @@ <exclude module="jmxtools" /> <exclude module="jms" /> <exclude module="jmxri" /> - <exclude org="com.thoughtworks.xstream"/> - <exclude org="org.apache.mrunit"/> - <exclude org="com.thoughtworks.xstream"/> + <exclude org="com.thoughtworks.xstream"/> </dependencies> http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivysettings.xml ---------------------------------------------------------------------- diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 0319333..d9b5044 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -35,7 +35,7 @@ value="https://repository.apache.org/content/repositories/snapshots/" override="false"/> <property name="maven2.pattern" - value="[organisation]/[module]/[revision]/[module]-[revision]"/> + value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/> <property name="maven2.pattern.ext" value="${maven2.pattern}.[ext]"/> <!-- pull in the local repository --> http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/java/org/apache/nutch/crawl/Injector.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 0d01dc8..383aaf1 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -319,12 +319,13 @@ public class Injector extends NutchTool implements Tool { setConf(conf); } - public void inject(Path crawlDb, Path urlDir) throws Exception { + public void inject(Path crawlDb, Path urlDir) + throws IOException, ClassNotFoundException, InterruptedException { inject(crawlDb, urlDir, false, false); } public void inject(Path crawlDb, Path urlDir, boolean overwrite, - boolean update) throws Exception { + boolean update) throws IOException, ClassNotFoundException, InterruptedException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -397,7 +398,7 @@ public class Injector extends NutchTool implements Tool { LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } - } catch (Exception e) { + } catch (IOException e) { if (fs.exists(tempCrawlDb)) { fs.delete(tempCrawlDb, true); } http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java new file mode 100644 index 0000000..7238f88 --- /dev/null +++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configuration.IntegerRanges; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Counters; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.JobID; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.Partitioner; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.Reducer.Context; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.TaskInputOutputContext; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.mrunit.mapreduce.ReduceDriver; +import org.apache.hadoop.mrunit.types.Pair; + +/** + * Utility to test transitions of {@link CrawlDatum} states during an update of + * {@link CrawlDb} (command {@literal updatedb}): call + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * (using MRUnit) with the old CrawlDatum (db status) and the new one (fetch + * status) + */ +public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> { + + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDbUpdateTestDriver.class); + + private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver; + private T reducer; + private Configuration configuration; + + public static Text dummyURL = new Text("http://nutch.apache.org/"); + +// protected CrawlDbUpdateUtilNewAPI(T red, T.Context con) { + protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) { + reducer = updateReducer; + configuration = conf; + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param values + * list of input CrawlDatums + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(List<CrawlDatum> values) { + List<CrawlDatum> result = new ArrayList<CrawlDatum>(0); + if (values == null || values.size() == 0) { + return result; + } + Collections.shuffle(values); // sorting of values should have no influence + reduceDriver = ReduceDriver.newReduceDriver(reducer); + reduceDriver.setConfiguration(configuration); + reduceDriver.withInput(dummyURL, values); + List<Pair<Text,CrawlDatum>> reduceResult; + try { + reduceResult = reduceDriver.run(); + for (Pair<Text,CrawlDatum> p : reduceResult) { + if (p.getFirst().equals(dummyURL)) { + result.add(p.getSecond()); + } + } + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + return result; + } + return result; + } + + /** + * run + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} + * and return the CrawlDatum(s) which would have been written into CrawlDb + * + * @param dbDatum + * previous CrawlDatum in CrawlDb + * @param fetchDatum + * CrawlDatum resulting from fetching + * @return list of resulting CrawlDatum(s) in CrawlDb + */ + public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) { + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + if (dbDatum != null) + values.add(dbDatum); + if (fetchDatum != null) + values.add(fetchDatum); + return update(values); + } + + /** + * see {@link #update(List)} + */ + public List<CrawlDatum> update(CrawlDatum... values) { + return update(Arrays.asList(values)); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java index bf951fd..c54559b 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java @@ -195,8 +195,9 @@ public class TestCrawlDbStates { public void testCrawlDbStatTransitionInject() { LOG.info("Test CrawlDatum states in Injector after inject"); Configuration conf = CrawlDBTestUtil.createConfiguration(); - CrawlDbUpdateUtil<Injector.InjectReducer> inject = new CrawlDbUpdateUtil<Injector.InjectReducer>( - new Injector.InjectReducer(), conf); + Injector.InjectReducer injector = new Injector.InjectReducer(); + CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver = + new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf); ScoringFilters scfilters = new ScoringFilters(conf); for (String sched : schedules) { LOG.info("Testing inject with " + sched); @@ -229,7 +230,7 @@ public class TestCrawlDbStates { LOG.error(StringUtils.stringifyException(e)); } values.add(injected); - List<CrawlDatum> res = inject.update(values); + List<CrawlDatum> res = injectDriver.update(values); if (res.size() != 1) { fail("Inject didn't result in one single CrawlDatum per URL"); continue; http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/TestInjector.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/crawl/TestInjector.java b/src/test/org/apache/nutch/crawl/TestInjector.java index ade6494..135f392 100644 --- a/src/test/org/apache/nutch/crawl/TestInjector.java +++ b/src/test/org/apache/nutch/crawl/TestInjector.java @@ -66,7 +66,8 @@ public class TestInjector { } @Test - public void testInject() throws IOException { + public void testInject() + throws IOException, ClassNotFoundException, InterruptedException { ArrayList<String> urls = new ArrayList<String>(); // We'll use a separate list for MD so we can still compare url with // containsAll http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/fetcher/TestFetcher.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java index 1196284..fae5f90 100644 --- a/src/test/org/apache/nutch/fetcher/TestFetcher.java +++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java @@ -79,7 +79,7 @@ public class TestFetcher { } @Test - public void testFetch() throws IOException { + public void testFetch() throws IOException, ClassNotFoundException, InterruptedException { // generate seedlist ArrayList<String> urls = new ArrayList<String>();
