Author: snagel
Date: Wed Apr  9 22:06:27 2014
New Revision: 1586162

URL: http://svn.apache.org/r1586162
Log:
NUTCH-1733 parse-html to support HTML5 charset definitions

Added:
    
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
   (with props)
Modified:
    nutch/branches/2.x/CHANGES.txt
    
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1586162&r1=1586161&r2=1586162&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr  9 22:06:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
+
 * NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)
 
 * NUTCH-1738 Expose number of URLs generated per batch in GeneratorJob (Talat 
UYARER via lewismc)

Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1586162&r1=1586161&r2=1586162&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Wed Apr  9 22:06:27 2014
@@ -75,6 +75,9 @@ public class HtmlParser implements Parse
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
         Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPatternHTML5 =
+                 
Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+                                 Pattern.CASE_INSENSITIVE);
 
   private static Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
 
@@ -86,15 +89,16 @@ public class HtmlParser implements Parse
 
   /**
    * Given a <code>ByteBuffer</code> representing an html file of an
-   * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag
+   * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag  
 
    * from the first <code>CHUNK_SIZE</code> bytes.
    * If there's no meta tag for Content-Type or no charset is specified,
+   * the content is checked for a Unicode Byte Order Mark (BOM).
+   * This will also cover non-byte oriented character encodings (UTF-16 only).
+   * If no character set can be determined,
    * <code>null</code> is returned.  <br />
-   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
-   * can't be handled with this.
-   * We need to do something similar to what's done by mozilla
-   * 
(http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
-   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
+   * See also 
http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
    * <br />
    *
    * @param content <code>ByteBuffer</code> representation of an html file
@@ -123,6 +127,30 @@ public class HtmlParser implements Parse
       if (charsetMatcher.find())
         encoding = new String(charsetMatcher.group(1));
     }
+    if (encoding == null) {
+      // check for HTML5 meta charset
+      metaMatcher = charsetPatternHTML5.matcher(str);
+      if (metaMatcher.find()) {
+        encoding = new String(metaMatcher.group(1));
+      }
+    }
+    if (encoding == null) {
+      // check for BOM
+       if (length >= 3
+          && content.get(0) == (byte) 0xEF
+          && content.get(1) == (byte) 0xBB
+          && content.get(2) == (byte) 0xBF) {
+        encoding = "UTF-8";
+      } else if (length >= 2) {
+        if (content.get(0) == (byte)0xFF
+            && content.get(1) == (byte)0xFE) {
+          encoding = "UTF-16LE";
+        } else if (content.get(0) == (byte)0xFE
+            && content.get(1) == (byte)0xFF) {
+          encoding = "UTF-16BE";
+        }
+      }
+    }
 
     return encoding;
   }

Added: 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1586162&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 Wed Apr  9 22:06:27 2014
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestHtmlParser {
+
+  public static final Logger LOG = 
LoggerFactory.getLogger(TestHtmlParser.class);
+
+  private static final String encodingTestKeywords = 
+      "français, español, русский язык, čeština, 
ελληνικά";
+  private static final String encodingTestBody =
+      "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  
<li>čeština\n  <li>ελληνικά\n</ul>";
+  /** Page tail shared by all fixtures: title, keywords meta tag and body with the test words. */
+  private static final String encodingTestContent = "<title>" + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages= {
+    { 
+      "HTML4, utf-8, meta http-equiv, no quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv=Content-Type content=\"text/html; 
charset=utf-8\" />"
+          + encodingTestContent
+    },
+    { 
+      "HTML4, utf-8, meta http-equiv, single quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv='Content-Type' content='text/html; 
charset=utf-8' />"
+          + encodingTestContent
+    },
+    { 
+      "XHTML, utf-8, meta http-equiv, double quotes",
+      "utf-8",
+      "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv=\"Content-Type\" content=\"text/html; 
charset=utf-8\" />"
+          + encodingTestContent
+    },
+    { 
+      "HTML5, utf-8, meta charset",
+      "utf-8",
+      "<!DOCTYPE html>\n<html>\n<head>\n"
+          + "<meta charset=\"utf-8\">"
+          + encodingTestContent
+    },
+    { 
+      "HTML5, utf-8, BOM",
+      "utf-8",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+          + encodingTestContent
+    },
+    { 
+      "HTML5, utf-16, BOM",
+      "utf-16",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+          + encodingTestContent
+    }
+  };
+  
+  private Configuration conf;
+  private Parser parser;
+  
+  private static final String dummyUrl = "http://dummy.url/";
+
+  
+  @Before
+  public void setup() {
+    conf = NutchConfiguration.create();
+    parser = new HtmlParser();
+    parser.setConf(conf);
+  }
+
+  protected WebPage page(byte[] contentBytes) {
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(dummyUrl));
+    page.setContent(ByteBuffer.wrap(contentBytes));
+    page.setContentType(new Utf8("text/html"));
+    return page;
+  }
+  
+  protected Parse parse(WebPage page) {
+    return parser.getParse(dummyUrl, page);
+  }
+
+
+  @Test
+  public void testEncodingDetection() {
+    for (String[] testPage : encodingTestPages) {
+      String name = testPage[0];
+      Charset charset = Charset.forName(testPage[1]);
+      byte[] contentBytes = testPage[2].getBytes(charset);
+      //Parse parse = parse(contentBytes);
+      WebPage page = page(contentBytes);
+      Parse parse = parse(page);
+      String text = parse.getText();
+      String title = parse.getTitle();
+      //String keywords = parse.getMeta("keywords");
+      String keywords = Bytes.toString(page
+          .getFromMetadata(new Utf8("keywords")));
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+      assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+      for (String keyword : encodingTestKeywords.split(",\\s*")) {
+        assertTrue(keyword + " not found in text (" + name + ")",
+            text.contains(keyword));
+      }
+      if (keywords != null) {
+        assertEquals("Keywords not extracted properly (" + name + ")",
+            encodingTestKeywords, keywords);
+      }
+    }
+  }
+
+}

Propchange: 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to