Author: lewismc
Date: Wed Jan 30 04:02:28 2013
New Revision: 1440266
URL: http://svn.apache.org/viewvc?rev=1440266&view=rev
Log:
NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers
Added:
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1440266&r1=1440265&r2=1440266&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 30 04:02:28 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers (Lufeng via lewismc)
+
* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas
Patil)
* NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1440266&r1=1440265&r2=1440266&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Wed Jan 30 04:02:28 2013
@@ -84,7 +84,7 @@ public class CrawlDbFilter implements Ma
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
url = null;
}
- if (urlNormalizers) {
+ if (url != null && urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
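For context: when CRAWLDB_PURGE_404 is enabled, the mapper nulls the url of any
CrawlDatum with STATUS_DB_GONE so the record gets dropped, but the old code then
passed that null url straight into the normalizer chain. Below is a minimal,
self-contained sketch of the guarded flow; the normalize() stand-in and the
hard-coded flags are illustration-only assumptions, not Nutch's plugin-driven
URLNormalizers.

public class NullGuardSketch {

  // Stand-in for URLNormalizers.normalize(url, scope); the real chain is
  // plugin-driven. This one just lower-cases the host, for illustration.
  static String normalize(String url, String scope) throws Exception {
    java.net.URL u = new java.net.URL(url); // fails if handed a null url
    return url.replace(u.getHost(), u.getHost().toLowerCase());
  }

  public static void main(String[] args) throws Exception {
    boolean url404Purging = true;  // stands in for CrawlDb.CRAWLDB_PURGE_404
    boolean urlNormalizers = true; // stands in for CrawlDbFilter.URL_NORMALIZING
    boolean gone = true;           // pretend value.getStatus() == STATUS_DB_GONE

    String url = "http://www.Example.com/";

    // Purging nulls the url so the record can be dropped downstream.
    if (url404Purging && gone) {
      url = null;
    }

    // The fix: only normalize a url that survived the purge step. Without
    // the url != null guard, normalize() would be called with null here.
    if (url != null && urlNormalizers) {
      url = normalize(url, "crawldb");
    }

    System.out.println("url after filter step: " + url); // prints: null
  }
}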
Added: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1440266&view=auto
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java (added)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Wed Jan 30 04:02:28 2013
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+
+import junit.framework.TestCase;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * CrawlDbFilter test which checks for correct, error-free URL
+ * normalization when the CrawlDb contains URLs with <code>DB_GONE</code> status
+ * and <code>CRAWLDB_PURGE_404</code> is set to true.
+ *
+ * @author lufeng
+ */
+public class TestCrawlDbFilter extends TestCase {
+ Configuration conf;
+ Path dbDir;
+ Path newCrawlDb;
+ final static Path testdir = new Path("build/test/crawldbfilter-test");
+ FileSystem fs;
+
+ protected void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ fs = FileSystem.get(conf);
+ fs.delete(testdir, true);
+ }
+
+ protected void tearDown() {
+ delete(testdir);
+ }
+
+ private void delete(Path p) {
+ try {
+ fs.delete(p, true);
+ } catch (IOException e) {
+ }
+ }
+
+ /**
+ * Test that records with <code>STATUS_DB_GONE</code> are purged when <code>CRAWLDB_PURGE_404</code> is enabled.
+ *
+ * @throws Exception
+ */
+ public void testUrl404Purging() throws Exception {
+ // create CrawlDatums with DB_GONE, DB_FETCHED and DB_UNFETCHED status
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+ dbDir = new Path(testdir, "crawldb");
+ newCrawlDb = new Path(testdir,"newcrawldb");
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+ // set CRAWLDB_PURGE_404 to true
+ conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
+ conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
+ conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
+ conf.setInt("urlnormalizer.loop.count", 2);
+ JobConf job = new NutchJob(conf);
+ job.setJobName("Test CrawlDbFilter");
+ Path current = new Path(dbDir, "current");
+ if (FileSystem.get(job).exists(current)) {
+ FileInputFormat.addInputPath(job, current);
+ }
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(CrawlDbFilter.class);
+ job.setReducerClass(CrawlDbReducer.class);
+ FileOutputFormat.setOutputPath(job, newCrawlDb);
+ job.setOutputFormat(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ JobClient.runJob(job);
+
+ Path fetchlist = new Path(new Path(newCrawlDb,
+ "part-00000"), "data");
+
+ ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+ // verify we got the right number of records: the DB_GONE record is purged
+ assertEquals(2, l.size());
+ }
+
+ /**
+ * Read the contents of the generated fetchlist.
+ * @param fetchlist path to the generated fetchlist
+ * @return the {@link URLCrawlDatum} objects read from it
+ * @throws IOException
+ */
+ private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
+ // verify results
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+
+ ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value)) {
+ break READ;
+ }
+ l.add(new URLCrawlDatum(key, value));
+ } while (true);
+
+ reader.close();
+ return l;
+ }
+}
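As a follow-up thought: the test asserts only the record count. A hypothetical
extension (not part of this commit) could also check which URLs survive the
filter; it relies on the URLCrawlDatum fields being visible here, which holds
because TestCrawlDbFilter sits in the same org.apache.nutch.crawl package as
CrawlDBTestUtil.

// Hypothetical extra assertions for testUrl404Purging(), placed after the
// readContents(fetchlist) call above: the DB_GONE url is purged, the rest kept.
java.util.Set<String> urls = new java.util.HashSet<String>();
for (URLCrawlDatum d : l) {
  urls.add(d.url.toString());
}
assertFalse(urls.contains("http://www.example.com"));  // STATUS_DB_GONE, purged
assertTrue(urls.contains("http://www.example1.com"));  // STATUS_DB_FETCHED, kept
assertTrue(urls.contains("http://www.example2.com"));  // STATUS_DB_UNFETCHED, kept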