Author: markus
Date: Fri Mar  7 10:04:42 2014
New Revision: 1575213

URL: http://svn.apache.org/r1575213
Log:
NUTCH-1113 Sebastian's fix for the unit test

Modified:
    
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java

Modified: 
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java?rev=1575213&r1=1575212&r2=1575213&view=diff
==============================================================================
--- 
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java 
(original)
+++ 
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java 
Fri Mar  7 10:04:42 2014
@@ -28,6 +28,8 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.MapFileOutputFormat;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import junit.framework.TestCase;
 
@@ -51,6 +53,9 @@ public class TestSegmentMergerCrawlDatum
   Configuration conf;
   FileSystem fs;
   Random rnd;
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestSegmentMergerCrawlDatums.class);
   
   public void setUp() throws Exception {
     conf = NutchConfiguration.create();
@@ -62,7 +67,10 @@ public class TestSegmentMergerCrawlDatum
    *
    */
   public void testSingleRandomSequence() throws Exception {
-    assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), new 
Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, 
CrawlDatum.STATUS_FETCH_SUCCESS, 256, false)));
+    assertEquals(
+        new Byte(CrawlDatum.STATUS_FETCH_SUCCESS),
+        new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE,
+            CrawlDatum.STATUS_FETCH_SUCCESS, 256, false)));
   }
   
   /**
@@ -103,9 +111,23 @@ public class TestSegmentMergerCrawlDatum
   public void testRandomizedSequences() throws Exception {
     for (int i = 0; i < rnd.nextInt(16) + 16; i++) {
       byte expectedStatus = (byte)(rnd.nextInt(6) + 0x21);
+      while (expectedStatus == CrawlDatum.STATUS_FETCH_RETRY
+          || expectedStatus == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
+        // fetch_retry and fetch_notmodified never remain in a merged segment
+        expectedStatus = (byte) (rnd.nextInt(6) + 0x21);
+      }
       byte randomStatus = (byte)(rnd.nextInt(6) + 0x21);
+      int rounds = rnd.nextInt(16) + 32;
+      boolean withRedirects = rnd.nextBoolean();
       
-      assertEquals(new Byte(expectedStatus), new 
Byte(executeSequence(randomStatus, expectedStatus, rnd.nextInt(16) + 32, 
rnd.nextBoolean())));
+      byte resultStatus = executeSequence(randomStatus, expectedStatus,
+          rounds, withRedirects);
+      assertEquals(
+          "Expected status = " + CrawlDatum.getStatusName(expectedStatus)
+              + ", but got " + CrawlDatum.getStatusName(resultStatus)
+              + " when merging " + rounds + " segments"
+              + (withRedirects ? " with redirects" : ""), expectedStatus,
+          resultStatus);
     }
   }
   
@@ -145,14 +167,12 @@ public class TestSegmentMergerCrawlDatum
     // Our test directory
     Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + 
System.currentTimeMillis());
     
-    Path segment1 = new Path(testDir, "00001");
-    Path segment2 = new Path(testDir, "00002");
-    Path segment3 = new Path(testDir, "00003");
+    Path segment = new Path(testDir, "00001");
     
-    createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, true, true);
+    createSegment(segment, CrawlDatum.STATUS_FETCH_SUCCESS, true, true);
     
     // Merge the segments and get status
-    Path mergedSegment = merge(testDir, new Path[]{segment3});
+    Path mergedSegment = merge(testDir, new Path[]{segment});
     Byte status = new Byte(status = checkMergedSegment(testDir, 
mergedSegment));
     
     assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
@@ -187,7 +207,7 @@ public class TestSegmentMergerCrawlDatum
    * @param whether redirects are injected randomly
    * @return the CrawlDatum status
    */
-  protected byte executeSequence(byte firstSatus, byte lastStatus, int rounds, 
boolean redirect) throws Exception {
+  protected byte executeSequence(byte firstStatus, byte lastStatus, int 
rounds, boolean redirect) throws Exception {
     // Our test directory
     Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + 
System.currentTimeMillis());
     
@@ -202,7 +222,7 @@ public class TestSegmentMergerCrawlDatum
     }
        
     // Create the first segment according to the specified status
-    createSegment(segmentPaths[0], firstSatus, false);
+    createSegment(segmentPaths[0], firstStatus, false);
     
     // Create N segments with random status and optionally with randomized 
redirect injection
     for (int i = 1; i < rounds - 1; i++) {
@@ -210,14 +230,17 @@ public class TestSegmentMergerCrawlDatum
       byte status = (byte)(rnd.nextInt(6) + 0x21);
       
       // Whether this is going to be a redirect
-      boolean needsToRedirect = redirect ? rnd.nextBoolean() : false;
-      boolean redirectAndFetch = redirect ? rnd.nextBoolean() : false;
+      boolean addRedirect = redirect ? rnd.nextBoolean() : false;
+      // If it's a redirect we add a datum resulting from a fetch at random,
+      // if not: always add a fetch datum to avoid empty segments
+      boolean addFetch = addRedirect ? rnd.nextBoolean() : true;
       
-      createSegment(segmentPaths[i], status, needsToRedirect, 
redirectAndFetch);
+      createSegment(segmentPaths[i], status, addFetch, addRedirect);
     }
 
     // Create the last segment according to the specified status
-    createSegment(segmentPaths[rounds - 1], lastStatus, redirect ? 
rnd.nextBoolean() : false, redirect ? rnd.nextBoolean() : false);    
+    // (additionally, add a redirect at random)
+    createSegment(segmentPaths[rounds - 1], lastStatus, true, redirect ? 
rnd.nextBoolean() : false);
     
     // Merge the segments!
     Path mergedSegment = merge(testDir, segmentPaths);
@@ -243,7 +266,7 @@ public class TestSegmentMergerCrawlDatum
     
     for (MapFile.Reader reader : readers) {
       while (reader.next(key, value)) {
-        System.out.println("Reading status for: " + key.toString() + " > " + 
CrawlDatum.getStatusName(value.getStatus()));
+        LOG.info("Reading status for: " + key.toString() + " > " + 
CrawlDatum.getStatusName(value.getStatus()));
         
         // Only consider fetch status
         if (CrawlDatum.hasFetchStatus(value) && 
key.toString().equals("http://nutch.apache.org/")) {
@@ -258,7 +281,7 @@ public class TestSegmentMergerCrawlDatum
     // Remove the test directory again
     fs.delete(testDir, true);
     
-    System.out.println("Final fetch status for: http://nutch.apache.org/ > " + 
CrawlDatum.getStatusName(finalStatus));
+    LOG.info("Final fetch status for: http://nutch.apache.org/ > " + 
CrawlDatum.getStatusName(finalStatus));
 
     // Return the final status
     return finalStatus;
@@ -301,7 +324,7 @@ public class TestSegmentMergerCrawlDatum
   }
 
   protected void createSegment(Path segment, byte status, boolean fetch, 
boolean redirect) throws Exception {
-    System.out.println("\nSegment: " + segment.toString());
+    LOG.info("\nSegment: " + segment.toString());
     
     // The URL of our main record
     String url = "http://nutch.apache.org/";
@@ -324,7 +347,7 @@ public class TestSegmentMergerCrawlDatum
     // - before fetch status to check whether fetch datum is preferred over 
linked datum when merging
     if (redirect) {
       // We're writing our our main record URL with status linked
-      System.out.println(url + " > " + 
CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
+      LOG.info(url + " > " + 
CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
       value = new CrawlDatum();
       value.setStatus(CrawlDatum.STATUS_LINKED);
       writer.append(new Text(url), value);
@@ -332,7 +355,7 @@ public class TestSegmentMergerCrawlDatum
 
     // Whether we're fetching now
     if (fetch) {
-      System.out.println(url + " > " + CrawlDatum.getStatusName(status));
+      LOG.info(url + " > " + CrawlDatum.getStatusName(status));
       
       // Set the status
       value.setStatus(status);
@@ -344,7 +367,7 @@ public class TestSegmentMergerCrawlDatum
     // Whether we're handing a redirect now
     if (redirect) {
       // And the redirect URL with redirect status, pointing to our main URL
-      System.out.println(redirectUrl + " > " + 
CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
+      LOG.info(redirectUrl + " > " + 
CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
       value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
       writer.append(new Text(redirectUrl), value);
     }


Reply via email to