Repository: nutch Updated Branches: refs/heads/master 8572fd955 -> 6d2bfa986
NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 - generate base64 encoded string directly from content bytes (patch provided by Federico Bonelli) - add JUnit test to test indexing base64 encoded binary content with UTF-8, ISO-8859-1 and ISO-8859-2 character sets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6d2bfa98 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6d2bfa98 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6d2bfa98 Branch: refs/heads/master Commit: 6d2bfa98635d8055d56dbe2597efc953f420ed5a Parents: 8572fd9 Author: Sebastian Nagel <[email protected]> Authored: Mon Apr 25 14:40:44 2016 +0200 Committer: Sebastian Nagel <[email protected]> Committed: Wed Apr 27 22:49:47 2016 +0200 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../apache/nutch/indexer/IndexerMapReduce.java | 12 +- .../apache/nutch/indexer/NutchIndexAction.java | 3 + .../nutch/indexer/TestIndexerMapReduce.java | 187 +++++++++++++++++++ 4 files changed, 198 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index e14d7c5..6173134 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch Nutch Change Log +* NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 (Federico Bonelli, snagel) + * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann) * NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen) 
http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/IndexerMapReduce.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 1d5f66f..5025525 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -350,14 +350,14 @@ public class IndexerMapReduce extends Configured implements } if (content != null) { - // Get the original unencoded content - String binary = new String(content.getContent()); - - // optionally encode as base64 + // Add the original binary content + String binary; if (base64) { - binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary)); + // optionally encode as base64 + binary = Base64.encodeBase64String(content.getContent()); + } else { + binary = new String(content.getContent()); } - doc.add("binaryContent", binary); } http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/NutchIndexAction.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/NutchIndexAction.java b/src/java/org/apache/nutch/indexer/NutchIndexAction.java index 679d784..b2517c3 100644 --- a/src/java/org/apache/nutch/indexer/NutchIndexAction.java +++ b/src/java/org/apache/nutch/indexer/NutchIndexAction.java @@ -37,6 +37,9 @@ public class NutchIndexAction implements Writable { public NutchDocument doc = null; public byte action = ADD; + protected NutchIndexAction() { + } + public NutchIndexAction(NutchDocument doc, byte action) { this.doc = doc; this.action = action; http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java ---------------------------------------------------------------------- diff --git 
a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java new file mode 100644 index 0000000..d581a0f --- /dev/null +++ b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java @@ -0,0 +1,187 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer; + +import org.apache.commons.codec.binary.Base64; +import org.apache.hadoop.mrunit.ReduceDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.Reducer; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; + +/** Test {@link IndexerMapReduce} */ +public class TestIndexerMapReduce { + + private static final Logger LOG = LoggerFactory + .getLogger(TestIndexerMapReduce.class); + + public static String testUrl = "http://nutch.apache.org/"; + public static Text testUrlText = new Text(testUrl); + public static String htmlContentType = "text/html"; + public static String testHtmlDoc = "<!DOCTYPE html>\n" + + "<html>\n" + + "<head>\n" + + "<title>Test Indexing Binary Content</title>\n" + + "<meta charset=\"utf-8\">\n" + + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n" + + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractères\" />\n" + + "<meta name=\"keywords\" lang=\"cs\" content=\"kódování znaků\" />\n" + + "</head>\n" + + "<body>\n" + + "<p>\n" + + "<ul>\n" + + "  <li lang=\"en\">English: character set, encoding\n" + + "  <li lang=\"fr\">Français: codage des caractères\n" + + "  <li 
lang=\"cs\">Čeština: kódování znaků (not covered by Latin-1)\n" + + "</ul>\n" + + "</body>\n" + + "</html>"; + public static Metadata htmlMeta = new Metadata(); + static { + htmlMeta.add("Content-Type", "text/html"); + // add segment and signature to avoid NPEs + htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123"); + htmlMeta.add(Nutch.SIGNATURE_KEY, "123"); + } + public static ParseText parseText = new ParseText("Test"); + public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + "Test", new Outlink[] {}, htmlMeta); + public static CrawlDatum crawlDatumDbFetched = new CrawlDatum( + CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24); + public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum( + CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24); + + private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce(); + private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver; + private Configuration configuration; + + + /** + * Test indexing of base64-encoded binary content. 
+ */ + @Test + public void testBinaryContentBase64() { + configuration = NutchConfiguration.create(); + configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true); + + Charset[] testCharsets = { StandardCharsets.UTF_8, + Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") }; + for (Charset charset : testCharsets) { + LOG.info("Testing indexing binary content as base64 for charset {}", + charset.name()); + + String htmlDoc = testHtmlDoc; + if (charset != StandardCharsets.UTF_8) { + htmlDoc = htmlDoc.replaceAll("utf-8", charset.name()); + if (charset.name().equalsIgnoreCase("iso-8859-1")) { + // Western-European character set: remove Czech content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", ""); + } else if (charset.name().equalsIgnoreCase("iso-8859-2")) { + // Eastern-European character set: remove French content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", ""); + } + } + + Content content = new Content(testUrl, testUrl, + htmlDoc.getBytes(charset), htmlContentType, htmlMeta, + configuration); + + NutchDocument doc = runIndexer(crawlDatumDbFetched, + crawlDatumFetchSuccess, parseText, parseData, content); + assertNotNull("No NutchDocument indexed", doc); + + String binaryContentBase64 = (String) doc.getField("binaryContent") + .getValues().get(0); + LOG.info("binary content (base64): {}", binaryContentBase64); + String binaryContent = new String( + Base64.decodeBase64(binaryContentBase64), charset); + LOG.info("binary content (decoded): {}", binaryContent); + assertEquals( + "Binary content (" + charset + ") not correctly saved as base64", + htmlDoc, binaryContent); + } + } + + /** + * Run {@link IndexerMapReduce.reduce(...)} to get a "indexed" + * {@link NutchDocument} by passing objects from segment and CrawlDb to the + * indexer. 
+ * + * @param dbDatum + * crawl datum from CrawlDb + * @param fetchDatum + * crawl datum (fetch status) from segment + * @param parseText + * plain text from parsed document + * @param parseData + * parse data + * @param content + * (optional, if index binary content) protocol content + * @return "indexed" document + */ + public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, + ParseText parseText, ParseData parseData, Content content) { + List<NutchWritable> values = new ArrayList<NutchWritable>(); + values.add(new NutchWritable(dbDatum)); + values.add(new NutchWritable(fetchDatum)); + values.add(new NutchWritable(parseText)); + values.add(new NutchWritable(parseData)); + values.add(new NutchWritable(content)); + reduceDriver = ReduceDriver.newReduceDriver(reducer); + reduceDriver.setConfiguration(configuration); + reduceDriver.withInput(testUrlText, values); + List<Pair<Text, NutchIndexAction>> reduceResult; + NutchDocument doc = null; + try { + reduceResult = reduceDriver.run(); + for (Pair<Text, NutchIndexAction> p : reduceResult) { + if (p.getSecond().action != NutchIndexAction.DELETE) { + doc = p.getSecond().doc; + } + } + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + } + return doc; + } + +}
