This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new d565f45  NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding - catch IllegalArgumentException when unescaping percent-encoding in URLs - if one URL of two compared URLs is valid, keep it as non-duplicate - add unit tests for DeduplicationJob
d565f45 is described below

commit d565f45a67d2491b7b536ae95560522aa20b8c26
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Fri Jan 14 10:34:22 2022 +0100

    NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding
    - catch IllegalArgumentException when unescaping percent-encoding in URLs
    - if one URL of two compared URLs is valid, keep it as non-duplicate
    - add unit tests for DeduplicationJob
---
 .../org/apache/nutch/crawl/DeduplicationJob.java  |  60 ++++----
 .../nutch/crawl/TestCrawlDbDeduplication.java     | 163 +++++++++++++++++++++
 .../current/part-r-00000/.data.crc                | Bin 0 -> 32 bytes
 .../current/part-r-00000/.index.crc               | Bin 0 -> 12 bytes
 .../current/part-r-00000/data                     | Bin 0 -> 2604 bytes
 .../current/part-r-00000/index                    | Bin 0 -> 233 bytes
 6 files changed, 196 insertions(+), 27 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 7751366..5f1172d 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
 import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
 import java.util.HashMap;
 import java.util.Map;
@@ -30,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Counter;
 import org.apache.hadoop.mapreduce.CounterGroup;
 import org.apache.hadoop.mapreduce.Job;
@@ -59,15 +61,16 @@ import org.slf4j.LoggerFactory;
  * with the latest timestamp is kept. If the documents have the same timestamp
  * then the one with the shortest URL is kept. The documents marked as duplicate
  * can then be deleted with the command CleaningJob.
- ***/
+ */
 public class DeduplicationJob extends NutchTool implements Tool {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private final static Text urlKey = new Text("_URLTEMPKEY_");
-  private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
-  private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+  protected final static Text urlKey = new Text("_URLTEMPKEY_");
+  protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+  protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+  protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
 
   public static class DBFilter extends
       Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -76,13 +79,12 @@ public class DeduplicationJob extends NutchTool implements Tool {
 
     @Override
     public void setup(Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum>.Context context) {
-      Configuration arg0 = context.getConfiguration();
-      groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
+      Configuration conf = context.getConfiguration();
+      groupMode = conf.get(DEDUPLICATION_GROUP_MODE);
     }
 
     @Override
-    public void map(Text key, CrawlDatum value,
-        Context context)
+    public void map(Text key, CrawlDatum value, Context context)
         throws IOException, InterruptedException {
 
       if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
@@ -121,18 +123,19 @@ public class DeduplicationJob extends NutchTool implements Tool {
     }
   }
 
-  public static class DedupReducer extends
-      Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {
+  public static class DedupReducer<K extends Writable>
+      extends Reducer<K, CrawlDatum, Text, CrawlDatum> {
 
-    private String[] compareOrder;
+    protected String[] compareOrder;
 
     @Override
-    public void setup(Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum>.Context context) {
+    public void setup(
+        Reducer<K, CrawlDatum, Text, CrawlDatum>.Context context) {
       Configuration conf = context.getConfiguration();
       compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
     }
 
-    private void writeOutAsDuplicate(CrawlDatum datum,
+    protected void writeOutAsDuplicate(CrawlDatum datum,
         Context context) throws IOException, InterruptedException {
       datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
@@ -143,8 +146,8 @@ public class DeduplicationJob extends NutchTool implements Tool {
     }
 
     @Override
-    public void reduce(BytesWritable key, Iterable<CrawlDatum> values,
-        Context context) throws IOException, InterruptedException {
+    public void reduce(K key, Iterable<CrawlDatum> values, Context context)
+        throws IOException, InterruptedException {
       CrawlDatum existingDoc = null;
 
       for (CrawlDatum newDoc : values) {
@@ -164,8 +167,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
       }
     }
 
-    private CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc)
-        throws IOException {
+    protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
      for (int i = 0; i < compareOrder.length; i++) {
        switch (compareOrder[i]) {
        case "score":
@@ -203,17 +205,21 @@ public class DeduplicationJob extends NutchTool implements Tool {
           }
           break;
         case "urlLength":
-          // same time? keep the one which has the shortest URL
-          String urlExisting;
-          String urlnewDoc;
+          // keep the one which has the shortest URL
+          // normalized by decoding percent-encoded sequences
+          String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+          String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+          try {
+            urlExisting = URLDecoder.decode(urlExisting, UTF_8);
+          } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+            LOG.error("Error decoding: {}", urlExisting, e);
+            // use the encoded URL
+          }
           try {
-            urlExisting = URLDecoder.decode(
-                existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
-            urlnewDoc = URLDecoder
-                .decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
-          } catch (UnsupportedEncodingException e) {
-            LOG.error("Error decoding: " + urlKey);
-            throw new IOException("UnsupportedEncodingException for " + urlKey);
+            urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
+          } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+            LOG.error("Error decoding: {}", urlnewDoc, e);
+            // use the encoded URL
           }
           if (urlExisting.length() < urlnewDoc.length()) {
             // mark new one as duplicate
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
new file mode 100644
index 0000000..25850a9
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestCrawlDbDeduplication {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  Configuration conf;
+  FileSystem fs;
+  String testDir;
+  Path testCrawlDb;
+  CrawlDbReader reader;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = "test-crawldb-" + new java.util.Random().nextInt();
+    File sampleCrawlDb = new File(System.getProperty("test.build.data", "."),
+        "deduplication-crawldb");
+    LOG.info("Copying CrawlDb {} into test directory {}", sampleCrawlDb,
+        testDir);
+    FileUtils.copyDirectory(sampleCrawlDb, new File(testDir));
+    testCrawlDb = new Path(testDir);
+    for (FileStatus s : fs.listStatus(testCrawlDb)) {
+      LOG.info("{}", s);
+    }
+    reader = new CrawlDbReader();
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      if (fs.exists(testCrawlDb))
+        fs.delete(testCrawlDb, true);
+    } catch (Exception e) {
+    }
+    try {
+      reader.close();
+    } catch (Exception e) {
+    }
+  }
+
+  @Test
+  public void testDeduplication() throws Exception {
+    String[] args = new String[3];
+    args[0] = testCrawlDb.toString();
+    args[1] = "-compareOrder";
+    args[2] = "fetchTime,urlLength,score";
+    int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+    Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+    String url1 = "http://nutch.apache.org/";
+    String url2 = "https://nutch.apache.org/";
+    // url1 has been fetched earlier, so it should "survive" as "db_fetched":
+    checkStatus(url1, CrawlDatum.STATUS_DB_FETCHED);
+    checkStatus(url2, CrawlDatum.STATUS_DB_DUPLICATE);
+  }
+
+  @Test
+  public void testDeduplicationHttpsOverHttp() throws Exception {
+    String[] args = new String[3];
+    args[0] = testCrawlDb.toString();
+    args[1] = "-compareOrder";
+    args[2] = "httpsOverHttp,fetchTime,urlLength,score";
+    int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+    Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+    String url1 = "http://nutch.apache.org/";
+    String url2 = "https://nutch.apache.org/";
+    // url2 is https://, so it should "survive" as "db_fetched":
+    checkStatus(url1, CrawlDatum.STATUS_DB_DUPLICATE);
+    checkStatus(url2, CrawlDatum.STATUS_DB_FETCHED);
+  }
+
+  private void checkStatus(String url, byte status) throws IOException {
+    CrawlDatum datum = reader.get(testCrawlDb.toString(), url, conf);
+    Assert.assertNotNull("No CrawlDatum found in CrawlDb for " + url, datum);
+    Assert.assertEquals(
+        "Expected status for " + url + ": " + CrawlDatum.getStatusName(status),
+        status, datum.getStatus());
+  }
+
+  static class TestDedupReducer extends DeduplicationJob.DedupReducer<Text> {
+
+    void setCompareOrder(String compareOrder) {
+      this.compareOrder = compareOrder.split(",");
+    }
+
+    String getDuplicate(String one, String two) {
+      CrawlDatum d1 = new CrawlDatum();
+      d1.getMetaData().put(DeduplicationJob.urlKey, new Text(one));
+      CrawlDatum d2 = new CrawlDatum();
+      d2.getMetaData().put(DeduplicationJob.urlKey, new Text(two));
+      CrawlDatum dup = getDuplicate(d1, d2);
+      if (dup == null) {
+        return null;
+      }
+      return dup.getMetaData().get(DeduplicationJob.urlKey).toString();
+    }
+  }
+
+  public String getDuplicateURL(String compareOrder, String url1, String url2) {
+    TestDedupReducer dedup = new TestDedupReducer();
+    dedup.setCompareOrder(compareOrder);
+    return dedup.getDuplicate(url1, url2);
+  }
+
+  @Test
+  public void testCompareURLs() {
+    // test same protocol, same length: no decision possible
+    String url0 = "https://example.com/";
+    Assert.assertNull(getDuplicateURL("httpsOverHttp,urlLength", url0, url0));
+    String url1 = "http://nutch.apache.org/";
+    String url2 = "https://nutch.apache.org/";
+    // test httpsOverHttp
+    Assert.assertEquals(url1, getDuplicateURL("httpsOverHttp", url1, url2));
+    // test urlLength
+    Assert.assertEquals(url2, getDuplicateURL("urlLength", url1, url2));
+    // test urlLength with percent-encoded URLs
+    // "b%C3%BCcher" (unescaped "bücher") is shorter than "buecher"
+    String url3 = "https://example.com/b%C3%BCcher";
+    String url4 = "https://example.com/buecher";
+    Assert.assertEquals(url4, getDuplicateURL("urlLength", url3, url4));
+    // test NUTCH-2935: should not throw error on invalid percent-encoding
+    String url5 = "https://example.com/%YR";
+    String url6 = "https://example.com/%YR%YR";
+    Assert.assertEquals(url6, getDuplicateURL("urlLength", url5, url6));
+  }
+
+}
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc
new file mode 100644
index 0000000..d43f0ac
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc
new file mode 100644
index 0000000..9303568
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/data b/src/testresources/deduplication-crawldb/current/part-r-00000/data
new file mode 100644
index 0000000..640a0d5
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/data differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/index b/src/testresources/deduplication-crawldb/current/part-r-00000/index
new file mode 100644
index 0000000..c1be1ec
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/index differ
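Note for readers outside a Hadoop setup: the sketch below is a minimal standalone illustration (not part of the patch; the class name PercentDecodeExample and the helper safeDecode are invented for this example) of the behaviour the fix relies on. java.net.URLDecoder.decode() throws an unchecked IllegalArgumentException on an invalid percent-encoded sequence such as "%YR"; before this change the exception escaped DedupReducer.getDuplicate() and failed the job, while the patched code catches it and falls back to the encoded URL so the length comparison can continue.

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;
    import java.nio.charset.StandardCharsets;

    public class PercentDecodeExample {

      /**
       * Decode percent-encoding if possible, otherwise keep the encoded URL,
       * mirroring the fallback introduced by NUTCH-2935.
       */
      static String safeDecode(String url) {
        try {
          return URLDecoder.decode(url, StandardCharsets.UTF_8.toString());
        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
          // e.g. "%YR" is not a valid percent-encoded sequence:
          // URLDecoder.decode() throws IllegalArgumentException
          return url;
        }
      }

      public static void main(String[] args) {
        // valid encoding: prints "https://example.com/bücher"
        System.out.println(safeDecode("https://example.com/b%C3%BCcher"));
        // invalid encoding: printed unchanged instead of throwing
        System.out.println(safeDecode("https://example.com/%YR"));
      }
    }

Guarded this way, the urlLength comparison in getDuplicate() uses the decoded form when a URL is valid percent-encoding and the raw form otherwise, which is what the "%YR" cases in the new testCompareURLs() exercise.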