This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 8f692d13d45642f8b447d47af796f06487afeec2 Author: Sebastian Nagel <[email protected]> AuthorDate: Fri Dec 15 21:35:27 2017 +0100 NUTCH-2478 HTML parser should resolve base URL <base href=...> - finally fix parse-tika: - href attribute of base element dropped in DOM - need to call tikamd.get("Content-Location") - port HTML parser test from parse-html to parse-tika - add method to DomUtil which prints DocumentFragment --- src/java/org/apache/nutch/util/DomUtil.java | 9 +++++++++ .../java/org/apache/nutch/parse/html/HtmlParser.java | 13 ++++++++----- .../org/apache/nutch/parse/html/TestHtmlParser.java | 2 +- .../java/org/apache/nutch/parse/tika/TikaParser.java | 18 +++++++++++------- .../test/org/apache/nutch/tika}/TestHtmlParser.java | 10 +++++----- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java index e93477a..b4f0eac 100644 --- a/src/java/org/apache/nutch/util/DomUtil.java +++ b/src/java/org/apache/nutch/util/DomUtil.java @@ -31,7 +31,9 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.xerces.parsers.DOMParser; +import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; +import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -103,4 +105,11 @@ public class DomUtil { LOG.error("Error: ", ex); } } + + public static void saveDom(OutputStream os, DocumentFragment doc) { + NodeList docChildren = doc.getChildNodes(); + for (int i = 0; i < docChildren.getLength(); i++) { + saveDom(os, (Element) docChildren.item(i)); + } + } } diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index e940eb1..9ed9fa4 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -207,11 +207,14 @@ public class HtmlParser implements Parser { if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = null; - try { - baseTag = new URL(base, utils.getBase(root)); - } catch (MalformedURLException e) { - baseTag = base; + URL baseTag = base; + String baseTagHref = utils.getBase(root); + if (baseTagHref != null) { + try { + baseTag = new URL(base, baseTagHref); + } catch (MalformedURLException e) { + baseTag = base; + } } if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java index 8fe94e6..a4c8206 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java @@ -81,7 +81,7 @@ public class TestHtmlParser { "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; private static final String resolveBaseUrlTestContent = // - "<html>\\n<head>\n" + // + "<html>\n<head>\n" + // " <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + // " <base href=\"//www.example.com/\">\n" + // "</head>\n<body>\n" + // diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 1173504..ea864be 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -52,6 +52,7 @@ import org.apache.tika.sax.TeeContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; import org.xml.sax.ContentHandler; /** @@ -170,21 +171,24 @@ public class TikaParser implements org.apache.nutch.parse.Parser { if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = null; - try { - baseTag = new URL(base, utils.getBase(root)); - } catch (MalformedURLException e) { - baseTag = base; + URL baseTag = base; + String baseTagHref = tikamd.get("Content-Location"); + if (baseTagHref != null) { + try { + baseTag = new URL(base, baseTagHref); + } catch (MalformedURLException e) { + LOG.trace("Invalid <base href=\"{}\">", baseTagHref); + } } if (LOG.isTraceEnabled()) { - LOG.trace("Getting links..."); + LOG.trace("Getting links (base URL = {}) ...", baseTag); } // pre-1233 outlink extraction //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); // Get outlinks from Tika List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); - utils.getOutlinks(baseTag, l, root); + utils.getOutlinks(baseTag, l, tikaExtractedOutlinks); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java similarity index 96% copy from src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java copy to src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java index 8fe94e6..d2bc816 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.nutch.parse.html; +package org.apache.nutch.tika; import java.lang.invoke.MethodHandles; import java.nio.charset.Charset; @@ -23,7 +23,7 @@ import java.nio.charset.StandardCharsets; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.html.HtmlParser; +import org.apache.nutch.parse.tika.TikaParser; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.Parser; @@ -81,7 +81,7 @@ public class TestHtmlParser { "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; private static final String resolveBaseUrlTestContent = // - "<html>\\n<head>\n" + // + "<html>\n<head>\n" + // " <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + // " <base href=\"//www.example.com/\">\n" + // "</head>\n<body>\n" + // @@ -93,8 +93,8 @@ public class TestHtmlParser { public TestHtmlParser() { conf = NutchConfiguration.create(); - conf.set("plugin.includes", "parse-html"); - parser = new HtmlParser(); + conf.set("plugin.includes", "parse-tika"); + parser = new TikaParser(); parser.setConf(conf); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
