Author: lewismc
Date: Wed Jan 30 04:02:28 2013
New Revision: 1440266
URL: http://svn.apache.org/viewvc?rev=1440266&view=rev
Log:
NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers
Added:
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1440266&r1=1440265&r2=1440266&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 30 04:02:28 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers (Lufeng via lewismc)
+
* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas
Patil)
* NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1440266&r1=1440265&r2=1440266&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Wed Jan 30 04:02:28 2013
@@ -84,7 +84,7 @@ public class CrawlDbFilter implements Ma
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
url = null;
}
- if (urlNormalizers) {
+ if (url != null && urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
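For context: when CRAWLDB_PURGE_404 is enabled, the mapper nulls the url of any
CrawlDatum with STATUS_DB_GONE so the record gets dropped, but the old code then
passed that null url straight into the normalizer chain. Below is a minimal,
self-contained sketch of the guarded flow; the normalize() stand-in and the
hard-coded flags are illustration-only assumptions, not Nutch's plugin-driven
URLNormalizers.

public class NullGuardSketch {

  // Stand-in for URLNormalizers.normalize(url, scope); the real chain is
  // plugin-driven. This one just lower-cases the host, for illustration.
  static String normalize(String url, String scope) throws Exception {
    java.net.URL u = new java.net.URL(url); // fails if handed a null url
    return url.replace(u.getHost(), u.getHost().toLowerCase());
  }

  public static void main(String[] args) throws Exception {
    boolean url404Purging = true;  // stands in for CrawlDb.CRAWLDB_PURGE_404
    boolean urlNormalizers = true; // stands in for CrawlDbFilter.URL_NORMALIZING
    boolean gone = true;           // pretend value.getStatus() == STATUS_DB_GONE

    String url = "http://www.Example.com/";

    // Purging nulls the url so the record can be dropped downstream.
    if (url404Purging && gone) {
      url = null;
    }

    // The fix: only normalize a url that survived the purge step. Without
    // the url != null guard, normalize() would be called with null here.
    if (url != null && urlNormalizers) {
      url = normalize(url, "crawldb");
    }

    System.out.println("url after filter step: " + url); // prints: null
  }
}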
Added: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1440266&view=auto
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java (added)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Wed Jan 30 04:02:28 2013
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+
+import junit.framework.TestCase;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * CrawlDbFilter test which checks for correct, error-free URL
+ * normalization when the CrawlDb contains URLs with <code>DB_GONE</code> status
+ * and <code>CRAWLDB_PURGE_404</code> is set to true.
+ *
+ * @author lufeng
+ */
+public class TestCrawlDbFilter extends TestCase {
+ Configuration conf;
+ Path dbDir;
+ Path newCrawlDb;
+ final static Path testdir = new Path("build/test/crawldbfilter-test");
+ FileSystem fs;
+
+ protected void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ fs = FileSystem.get(conf);
+ fs.delete(testdir, true);
+ }
+
+ protected void tearDown() {
+ delete(testdir);
+ }
+
+ private void delete(Path p) {
+ try {
+ fs.delete(p, true);
+ } catch (IOException e) {
+ }
+ }
+
+ /**
+ * Test that records with <code>STATUS_DB_GONE</code> are purged when <code>CRAWLDB_PURGE_404</code> is enabled.
+ *
+ * @throws Exception
+ */
+ public void testUrl404Purging() throws Exception {
+ // create CrawlDatums with DB_GONE, DB_FETCHED and DB_UNFETCHED status
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum(
+ CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+ dbDir = new Path(testdir, "crawldb");
+ newCrawlDb = new Path(testdir,"newcrawldb");
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+ // set CRAWLDB_PURGE_404 to true
+ conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
+ conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
+ conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
+ conf.setInt("urlnormalizer.loop.count", 2);
+ JobConf job = new NutchJob(conf);
+ job.setJobName("Test CrawlDbFilter");
+ Path current = new Path(dbDir, "current");
+ if (FileSystem.get(job).exists(current)) {
+ FileInputFormat.addInputPath(job, current);
+ }
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(CrawlDbFilter.class);
+ job.setReducerClass(CrawlDbReducer.class);
+ FileOutputFormat.setOutputPath(job, newCrawlDb);
+ job.setOutputFormat(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ JobClient.runJob(job);
+
+ Path fetchlist = new Path(new Path(newCrawlDb,
+ "part-00000"), "data");
+
+ ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+ // verify we got the right number of records: the DB_GONE record is purged
+ assertEquals(2, l.size());
+ }
+
+ /**
+ * Read the contents of the generated fetchlist.
+ * @param fetchlist path to the generated fetchlist
+ * @return the {@link URLCrawlDatum} objects read from it
+ * @throws IOException
+ */
+ private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
+ // verify results
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+
+ ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value)) {
+ break READ;
+ }
+ l.add(new URLCrawlDatum(key, value));
+ } while (true);
+
+ reader.close();
+ return l;
+ }
+}
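As a follow-up thought: the test asserts only the record count. A hypothetical
extension (not part of this commit) could also check which URLs survive the
filter; it relies on the URLCrawlDatum fields being visible here, which holds
because TestCrawlDbFilter sits in the same org.apache.nutch.crawl package as
CrawlDBTestUtil.

// Hypothetical extra assertions for testUrl404Purging(), placed after the
// readContents(fetchlist) call above: the DB_GONE url is purged, the rest kept.
java.util.Set<String> urls = new java.util.HashSet<String>();
for (URLCrawlDatum d : l) {
  urls.add(d.url.toString());
}
assertFalse(urls.contains("http://www.example.com"));  // STATUS_DB_GONE, purged
assertTrue(urls.contains("http://www.example1.com"));  // STATUS_DB_FETCHED, kept
assertTrue(urls.contains("http://www.example2.com"));  // STATUS_DB_UNFETCHED, kept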