Author: snagel
Date: Wed Apr 9 22:06:27 2014
New Revision: 1586162
URL: http://svn.apache.org/r1586162
Log:
NUTCH-1733 parse-html to support HTML5 charset definitions
Added:
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
(with props)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1586162&r1=1586161&r2=1586162&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr 9 22:06:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
+
* NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)
* NUTCH-1738 Expose number of URLs generated per batch in GeneratorJob (Talat
UYARER via lewismc)
Modified:
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1586162&r1=1586161&r2=1586162&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Apr 9 22:06:27 2014
@@ -75,6 +75,9 @@ public class HtmlParser implements Parse
private static Pattern charsetPattern =
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
+ private static Pattern charsetPatternHTML5 =
+ Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+ Pattern.CASE_INSENSITIVE);
private static Collection<WebPage.Field> FIELDS = new
HashSet<WebPage.Field>();
@@ -86,15 +89,16 @@ public class HtmlParser implements Parse
/**
* Given a <code>ByteBuffer</code> representing an html file of an
- * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
+ * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
* from the first <code>CHUNK_SIZE</code> bytes.
* If there's no meta tag for Content-Type or no charset is specified,
+ * the content is checked for a Unicode Byte Order Mark (BOM).
+ * This will also cover non-byte oriented character encodings (UTF-16 only).
+ * If no character set can be determined,
* <code>null</code> is returned. <br />
- * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
- * can't be handled with this.
- * We need to do something similar to what's done by mozilla
- * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
- * See also http://www.w3.org/TR/REC-xml/#sec-guessing
+ * See also http://www.w3.org/International/questions/qa-html-encoding-declarations,
+ * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+ * http://www.w3.org/TR/REC-xml/#sec-guessing
* <br />
*
* @param content <code>ByteBuffer</code> representation of an html file
@@ -123,6 +127,30 @@ public class HtmlParser implements Parse
if (charsetMatcher.find())
encoding = new String(charsetMatcher.group(1));
}
+ if (encoding == null) {
+ // check for HTML5 meta charset
+ metaMatcher = charsetPatternHTML5.matcher(str);
+ if (metaMatcher.find()) {
+ encoding = new String(metaMatcher.group(1));
+ }
+ }
+ if (encoding == null) {
+ // check for BOM
+ if (length >= 3
+ && content.get(0) == (byte) 0xEF
+ && content.get(1) == (byte) 0xBB
+ && content.get(2) == (byte) 0xBF) {
+ encoding = "UTF-8";
+ } else if (length >= 2) {
+ if (content.get(0) == (byte)0xFF
+ && content.get(1) == (byte)0xFE) {
+ encoding = "UTF-16LE";
+ } else if (content.get(0) == (byte)0xFE
+ && content.get(1) == (byte)0xFF) {
+ encoding = "UTF-16BE";
+ }
+ }
+ }
return encoding;
}
Added:
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1586162&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (added)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Wed Apr 9 22:06:27 2014
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestHtmlParser {
+
+ public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+
+ private static final String encodingTestKeywords =
+ "français, español, русский язык, čeština, ελληνικά";
+ private static final String encodingTestBody =
+ "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
+ private static final String encodingTestContent =
+ "<title>" + encodingTestKeywords + "</title>\n"
+ + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n"
+ + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+ private static String[][] encodingTestPages= {
+ {
+ "HTML4, utf-8, meta http-equiv, no quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent
+ },
+ {
+ "HTML4, utf-8, meta http-equiv, single quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+ + encodingTestContent
+ },
+ {
+ "XHTML, utf-8, meta http-equiv, double quotes",
+ "utf-8",
+ "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent
+ },
+ {
+ "HTML5, utf-8, meta charset",
+ "utf-8",
+ "<!DOCTYPE html>\n<html>\n<head>\n"
+ + "<meta charset=\"utf-8\">"
+ + encodingTestContent
+ },
+ {
+ "HTML5, utf-8, BOM",
+ "utf-8",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+ + encodingTestContent
+ },
+ {
+ "HTML5, utf-16, BOM",
+ "utf-16",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+ + encodingTestContent
+ }
+ };
+
+ private Configuration conf;
+ private Parser parser;
+
+ private static final String dummyUrl = "http://dummy.url/";
+
+
+ @Before
+ public void setup() {
+ conf = NutchConfiguration.create();
+ parser = new HtmlParser();
+ parser.setConf(conf);
+ }
+
+ protected WebPage page(byte[] contentBytes) {
+ WebPage page = new WebPage();
+ page.setBaseUrl(new Utf8(dummyUrl));
+ page.setContent(ByteBuffer.wrap(contentBytes));
+ page.setContentType(new Utf8("text/html"));
+ return page;
+ }
+
+ protected Parse parse(WebPage page) {
+ return parser.getParse(dummyUrl, page);
+ }
+
+
+ @Test
+ public void testEncodingDetection() {
+ for (String[] testPage : encodingTestPages) {
+ String name = testPage[0];
+ Charset charset = Charset.forName(testPage[1]);
+ byte[] contentBytes = testPage[2].getBytes(charset);
+ //Parse parse = parse(contentBytes);
+ WebPage page = page(contentBytes);
+ Parse parse = parse(page);
+ String text = parse.getText();
+ String title = parse.getTitle();
+ //String keywords = parse.getMeta("keywords");
+ String keywords = Bytes.toString(page
+ .getFromMetadata(new Utf8("keywords")));
+ LOG.info(name);
+ LOG.info("title:\t" + title);
+ LOG.info("keywords:\t" + keywords);
+ LOG.info("text:\t" + text);
+ assertEquals("Title not extracted properly (" + name + ")",
+ encodingTestKeywords, title);
+ for (String keyword : encodingTestKeywords.split(",\\s*")) {
+ assertTrue(keyword + " not found in text (" + name + ")",
+ text.contains(keyword));
+ }
+ if (keywords != null) {
+ assertEquals("Keywords not extracted properly (" + name + ")",
+ encodingTestKeywords, keywords);
+ }
+ }
+ }
+
+}
Propchange:
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
------------------------------------------------------------------------------
svn:eol-style = native