add unit tests based on MRUnit

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/288dceed
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/288dceed
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/288dceed

Branch: refs/heads/master
Commit: 288dceedb7de28457878eecb03a571d082a48cc2
Parents: 3c691eb
Author: Sebastian Nagel <[email protected]>
Authored: Sun Jan 17 21:32:31 2016 +0100
Committer: Sebastian Nagel <[email protected]>
Committed: Thu Feb 25 21:26:30 2016 +0100

----------------------------------------------------------------------
 ivy/ivy.xml                                     |  10 +-
 ivy/ivysettings.xml                             |   2 +-
 src/java/org/apache/nutch/crawl/Injector.java   |   7 +-
 .../nutch/crawl/CrawlDbUpdateTestDriver.java    | 138 +++++++++++++++++++
 .../apache/nutch/crawl/TestCrawlDbStates.java   |   7 +-
 .../org/apache/nutch/crawl/TestInjector.java    |   3 +-
 .../org/apache/nutch/fetcher/TestFetcher.java   |   2 +-
 7 files changed, 156 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 206cce7..bc8d293 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -11,7 +11,7 @@
        OF ANY KIND, either express or implied. See the License for the specific 
        language governing permissions and limitations under the License. -->
 
-<ivy-module version="1.0">
+<ivy-module version="1.0" xmlns:maven="http://ant.apache.org/ivy/maven">
        <info organisation="org.apache.nutch" module="nutch">
                <license name="Apache 2.0"
                        url="http://www.apache.org/licenses/LICENSE-2.0.txt/" />
@@ -98,6 +98,10 @@
 
                <!--artifacts needed for testing -->
                <dependency org="junit" name="junit" rev="4.11" conf="test->default" />
+               <dependency org="org.apache.mrunit" name="mrunit" rev="1.1.0" conf="test->default">
+                       <artifact name="mrunit" maven:classifier="hadoop2" />
+                       <exclude org="log4j" module="log4j" />
+               </dependency>
                <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
                <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
                <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
@@ -125,9 +129,7 @@
                <exclude module="jmxtools" />
                <exclude module="jms" />
                <exclude module="jmxri" />
-        <exclude org="com.thoughtworks.xstream"/>
-        <exclude org="org.apache.mrunit"/>
-        <exclude org="com.thoughtworks.xstream"/>
+               <exclude org="com.thoughtworks.xstream"/>
 
        </dependencies>
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/ivy/ivysettings.xml
----------------------------------------------------------------------
diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index 0319333..d9b5044 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -35,7 +35,7 @@
     value="https://repository.apache.org/content/repositories/snapshots/"
     override="false"/>
   <property name="maven2.pattern"
-    value="[organisation]/[module]/[revision]/[module]-[revision]"/>
+    value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
   <property name="maven2.pattern.ext"
     value="${maven2.pattern}.[ext]"/>
   <!-- pull in the local repository -->

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/java/org/apache/nutch/crawl/Injector.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 0d01dc8..383aaf1 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -319,12 +319,13 @@ public class Injector extends NutchTool implements Tool {
     setConf(conf);
   }
 
-  public void inject(Path crawlDb, Path urlDir) throws Exception {
+  public void inject(Path crawlDb, Path urlDir)
+      throws IOException, ClassNotFoundException, InterruptedException {
     inject(crawlDb, urlDir, false, false);
   }
 
   public void inject(Path crawlDb, Path urlDir, boolean overwrite,
-      boolean update) throws Exception {
+      boolean update) throws IOException, ClassNotFoundException, InterruptedException {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
 
@@ -397,7 +398,7 @@ public class Injector extends NutchTool implements Tool {
         LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
             + TimingUtil.elapsedTime(start, end));
       }
-    } catch (Exception e) {
+    } catch (IOException e) {
       if (fs.exists(tempCrawlDb)) {
         fs.delete(tempCrawlDb, true);
       }

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
new file mode 100644
index 0000000..7238f88
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configuration.IntegerRanges;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * (using MRUnit) with the old CrawlDatum (db status) and the new one (fetch
+ * status)
+ */
+public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateTestDriver.class);
+
+  private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver;
+  private T reducer;
+  private Configuration configuration;
+
+  public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+//  protected CrawlDbUpdateUtilNewAPI(T red, T.Context con) {
+  protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) {
+    reducer = updateReducer;
+    configuration = conf;
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param values
+   *          list of input CrawlDatums
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    List<CrawlDatum> result = new ArrayList<CrawlDatum>(0);
+    if (values == null || values.size() == 0) {
+      return result;
+    }
+    Collections.shuffle(values); // sorting of values should have no influence
+    reduceDriver = ReduceDriver.newReduceDriver(reducer);
+    reduceDriver.setConfiguration(configuration);
+    reduceDriver.withInput(dummyURL, values);
+    List<Pair<Text,CrawlDatum>> reduceResult;
+    try {
+      reduceResult = reduceDriver.run();
+      for (Pair<Text,CrawlDatum> p : reduceResult) {
+        if (p.getFirst().equals(dummyURL)) {
+          result.add(p.getSecond());
+        }
+      }
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      return result;
+    }
+    return result;
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+  /**
+   * see {@link #update(List)}
+   */
+  public List<CrawlDatum> update(CrawlDatum... values) {
+    return update(Arrays.asList(values));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
index bf951fd..c54559b 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -195,8 +195,9 @@ public class TestCrawlDbStates {
   public void testCrawlDbStatTransitionInject() {
     LOG.info("Test CrawlDatum states in Injector after inject");
     Configuration conf = CrawlDBTestUtil.createConfiguration();
-    CrawlDbUpdateUtil<Injector.InjectReducer> inject = new CrawlDbUpdateUtil<Injector.InjectReducer>(
-        new Injector.InjectReducer(), conf);
+    Injector.InjectReducer injector = new Injector.InjectReducer();
+    CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver =
+        new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
     ScoringFilters scfilters = new ScoringFilters(conf);
     for (String sched : schedules) {
       LOG.info("Testing inject with " + sched);
@@ -229,7 +230,7 @@ public class TestCrawlDbStates {
           LOG.error(StringUtils.stringifyException(e));
         }
         values.add(injected);
-        List<CrawlDatum> res = inject.update(values);
+        List<CrawlDatum> res = injectDriver.update(values);
         if (res.size() != 1) {
           fail("Inject didn't result in one single CrawlDatum per URL");
           continue;

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/crawl/TestInjector.java b/src/test/org/apache/nutch/crawl/TestInjector.java
index ade6494..135f392 100644
--- a/src/test/org/apache/nutch/crawl/TestInjector.java
+++ b/src/test/org/apache/nutch/crawl/TestInjector.java
@@ -66,7 +66,8 @@ public class TestInjector {
   }
 
   @Test
-  public void testInject() throws IOException {
+  public void testInject()
+      throws IOException, ClassNotFoundException, InterruptedException {
     ArrayList<String> urls = new ArrayList<String>();
     // We'll use a separate list for MD so we can still compare url with
     // containsAll

http://git-wip-us.apache.org/repos/asf/nutch/blob/288dceed/src/test/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java
index 1196284..fae5f90 100644
--- a/src/test/org/apache/nutch/fetcher/TestFetcher.java
+++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java
@@ -79,7 +79,7 @@ public class TestFetcher {
   }
 
   @Test
-  public void testFetch() throws IOException {
+  public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
 
     // generate seedlist
     ArrayList<String> urls = new ArrayList<String>();

Reply via email to