Repository: nutch Updated Branches: refs/heads/master 8572fd955 -> 6d2bfa986
NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 - generate base64 encoded string directly from content bytes (patch provided by Federico Bonelli) - add JUnit test to test indexing base64 encoded binary content with UTF-8, ISO-8859-1 and ISO-8859-2 character sets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6d2bfa98 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6d2bfa98 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6d2bfa98 Branch: refs/heads/master Commit: 6d2bfa98635d8055d56dbe2597efc953f420ed5a Parents: 8572fd9 Author: Sebastian Nagel <[email protected]> Authored: Mon Apr 25 14:40:44 2016 +0200 Committer: Sebastian Nagel <[email protected]> Committed: Wed Apr 27 22:49:47 2016 +0200 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../apache/nutch/indexer/IndexerMapReduce.java | 12 +- .../apache/nutch/indexer/NutchIndexAction.java | 3 + .../nutch/indexer/TestIndexerMapReduce.java | 187 +++++++++++++++++++ 4 files changed, 198 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index e14d7c5..6173134 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch Nutch Change Log +* NUTCH-2254 Indexer: character set issue with -addBinaryContent and -base64 (Federico Bonelli, snagel) + * NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann) * NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen) 
http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/IndexerMapReduce.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 1d5f66f..5025525 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -350,14 +350,14 @@ public class IndexerMapReduce extends Configured implements } if (content != null) { - // Get the original unencoded content - String binary = new String(content.getContent()); - - // optionally encode as base64 + // Add the original binary content + String binary; if (base64) { - binary = Base64.encodeBase64String(StringUtils.getBytesUtf8(binary)); + // optionally encode as base64 + binary = Base64.encodeBase64String(content.getContent()); + } else { + binary = new String(content.getContent()); } - doc.add("binaryContent", binary); } http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/java/org/apache/nutch/indexer/NutchIndexAction.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/NutchIndexAction.java b/src/java/org/apache/nutch/indexer/NutchIndexAction.java index 679d784..b2517c3 100644 --- a/src/java/org/apache/nutch/indexer/NutchIndexAction.java +++ b/src/java/org/apache/nutch/indexer/NutchIndexAction.java @@ -37,6 +37,9 @@ public class NutchIndexAction implements Writable { public NutchDocument doc = null; public byte action = ADD; + protected NutchIndexAction() { + } + public NutchIndexAction(NutchDocument doc, byte action) { this.doc = doc; this.action = action; http://git-wip-us.apache.org/repos/asf/nutch/blob/6d2bfa98/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java ---------------------------------------------------------------------- diff --git 
a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java new file mode 100644 index 0000000..d581a0f --- /dev/null +++ b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java @@ -0,0 +1,187 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer; + +import org.apache.commons.codec.binary.Base64; +import org.apache.hadoop.mrunit.ReduceDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.Reducer; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; + +/** Test {@link IndexerMapReduce} */ +public class TestIndexerMapReduce { + + private static final Logger LOG = LoggerFactory + .getLogger(TestIndexerMapReduce.class); + + public static String testUrl = "http://nutch.apache.org/"; + public static Text testUrlText = new Text(testUrl); + public static String htmlContentType = "text/html"; + public static String testHtmlDoc = "<!DOCTYPE html>\n" + + "<html>\n" + + "<head>\n" + + "<title>Test Indexing Binary Content</title>\n" + + "<meta charset=\"utf-8\">\n" + + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n" + + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractères\" />\n" + + "<meta name=\"keywords\" lang=\"cs\" content=\"kódování znaků\" />\n" + + "</head>\n" + + "<body>\n" + + "<p>\n" + + "<ul>\n" + + "  <li lang=\"en\">English: character set, encoding\n" + + "  <li lang=\"fr\">Français: codage des caractères\n" + + "  <li 
lang=\"cs\">Čeština: kódování znaků (not covered by Latin-1)\n" + + "</ul>\n" + + "</body>\n" + + "</html>"; + public static Metadata htmlMeta = new Metadata(); + static { + htmlMeta.add("Content-Type", "text/html"); + // add segment and signature to avoid NPEs + htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123"); + htmlMeta.add(Nutch.SIGNATURE_KEY, "123"); + } + public static ParseText parseText = new ParseText("Test"); + public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + "Test", new Outlink[] {}, htmlMeta); + public static CrawlDatum crawlDatumDbFetched = new CrawlDatum( + CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24); + public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum( + CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24); + + private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce(); + private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver; + private Configuration configuration; + + + /** + * Test indexing of base64-encoded binary content. 
+ */ + @Test + public void testBinaryContentBase64() { + configuration = NutchConfiguration.create(); + configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true); + + Charset[] testCharsets = { StandardCharsets.UTF_8, + Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") }; + for (Charset charset : testCharsets) { + LOG.info("Testing indexing binary content as base64 for charset {}", + charset.name()); + + String htmlDoc = testHtmlDoc; + if (charset != StandardCharsets.UTF_8) { + htmlDoc = htmlDoc.replaceAll("utf-8", charset.name()); + if (charset.name().equalsIgnoreCase("iso-8859-1")) { + // Western-European character set: remove Czech content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", ""); + } else if (charset.name().equalsIgnoreCase("iso-8859-2")) { + // Eastern-European character set: remove French content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", ""); + } + } + + Content content = new Content(testUrl, testUrl, + htmlDoc.getBytes(charset), htmlContentType, htmlMeta, + configuration); + + NutchDocument doc = runIndexer(crawlDatumDbFetched, + crawlDatumFetchSuccess, parseText, parseData, content); + assertNotNull("No NutchDocument indexed", doc); + + String binaryContentBase64 = (String) doc.getField("binaryContent") + .getValues().get(0); + LOG.info("binary content (base64): {}", binaryContentBase64); + String binaryContent = new String( + Base64.decodeBase64(binaryContentBase64), charset); + LOG.info("binary content (decoded): {}", binaryContent); + assertEquals( + "Binary content (" + charset + ") not correctly saved as base64", + htmlDoc, binaryContent); + } + } + + /** + * Run {@link IndexerMapReduce.reduce(...)} to get a "indexed" + * {@link NutchDocument} by passing objects from segment and CrawlDb to the + * indexer. 
+ * + * @param dbDatum + * crawl datum from CrawlDb + * @param fetchDatum + * crawl datum (fetch status) from segment + * @param parseText + * plain text from parsed document + * @param parseData + * parse data + * @param content + * (optional, if index binary content) protocol content + * @return "indexed" document + */ + public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, + ParseText parseText, ParseData parseData, Content content) { + List<NutchWritable> values = new ArrayList<NutchWritable>(); + values.add(new NutchWritable(dbDatum)); + values.add(new NutchWritable(fetchDatum)); + values.add(new NutchWritable(parseText)); + values.add(new NutchWritable(parseData)); + values.add(new NutchWritable(content)); + reduceDriver = ReduceDriver.newReduceDriver(reducer); + reduceDriver.setConfiguration(configuration); + reduceDriver.withInput(testUrlText, values); + List<Pair<Text, NutchIndexAction>> reduceResult; + NutchDocument doc = null; + try { + reduceResult = reduceDriver.run(); + for (Pair<Text, NutchIndexAction> p : reduceResult) { + if (p.getSecond().action != NutchIndexAction.DELETE) { + doc = p.getSecond().doc; + } + } + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + } + return doc; + } + +}
