[jira] [Commented] (NUTCH-2464) Headers That Contain HTML Elements Are Not Parsed

ASF GitHub Bot (JIRA) Thu, 30 Nov 2017 11:21:09 -0800

    [ 
https://issues.apache.org/jira/browse/NUTCH-2464?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16273208#comment-16273208
 ]


ASF GitHub Bot commented on NUTCH-2464:
---------------------------------------

sebastian-nagel closed pull request #244: Fix for NUTCH-2464 get textual 
content from nested heading nodes
URL: https://github.com/apache/nutch/pull/244
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/plugin/headings/ivy.xml b/src/plugin/headings/ivy.xml
index 5b8393b6b..b8482ffa9 100644
--- a/src/plugin/headings/ivy.xml
+++ b/src/plugin/headings/ivy.xml
@@ -36,6 +36,7 @@
   </publications>
 
   <dependencies>
+    <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="test->master"/>
   </dependencies>
   
 </ivy-module>
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
index e20a2da09..eaa2a7020 100644
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -108,12 +108,13 @@ public Configuration getConf() {
    */
   protected static String getNodeValue(Node node) {
     StringBuilder buffer = new StringBuilder();
+    NodeWalker walker = new NodeWalker(node);
 
-    NodeList children = node.getChildNodes();
+    while (walker.hasNext()) {
+      final Node n = walker.nextNode();
 
-    for (int i = 0; i < children.getLength(); i++) {
-      if (children.item(i).getNodeType() == Node.TEXT_NODE) {
-        buffer.append(children.item(i).getNodeValue());
+      if (n.getNodeType() == Node.TEXT_NODE) {
+        buffer.append(n.getNodeValue());
       }
     }
 
diff --git a/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java b/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
new file mode 100644
index 000000000..125d7567f
--- /dev/null
+++ b/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
@@ -0,0 +1,51 @@
+package org.apache.nutch.parse.headings;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import org.w3c.dom.DocumentFragment;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestHeadingsParseFilter {
+  private static Configuration conf = NutchConfiguration.create();
+
+  @Test
+  public void testExtractHeadingFromNestedNodes()
+      throws IOException, SAXException {
+
+    conf.setStrings("headings", "h1", "h2");
+    HtmlParseFilter filter = new HeadingsParseFilter();
+    filter.setConf(conf);
+
+    Content content = new Content("http://www.foo.com/", "http://www.foo.com/",
+        "".getBytes("UTF8"), "text/html; charset=UTF-8", new Metadata(), conf);
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+    ParseResult parseResult = ParseResult
+        .createParseResult("http://www.foo.com/", parse);
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+    DOMFragmentParser parser = new DOMFragmentParser();
+    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+    parser.parse(new InputSource(new ByteArrayInputStream(
+        ("<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>")
+            .getBytes())), node);
+
+    parseResult = filter.filter(content, parseResult, metaTags, node);
+
+    Assert.assertEquals(
+        "The h1 tag must include the content of the inner span node",
+        "header with span element",
+        parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
+  }
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Headers That Contain HTML Elements Are Not Parsed
> -------------------------------------------------
>
>                 Key: NUTCH-2464
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2464
>             Project: Nutch
>          Issue Type: Bug
>          Components: plugin
>    Affects Versions: 1.13
>         Environment: Internal development/test environments.
>            Reporter: Cass Pallansch
>         Attachments: NUTCH-2464-complex-header.html
>
>
> Nutch does not appear to traverse the HTML elements that may be contained 
> within header elements (e.g., H1, H2, H3, etc. tags).  Many times there are 
> anchors and/or <span> tags within these elements that contain the actual text 
> nodes that should be picked up as the header value for indexing purposes.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (NUTCH-2464) Headers That Contain HTML Elements Are Not Parsed

Reply via email to