Author: markus Date: Tue Feb 16 13:39:18 2016 New Revision: 1730687 URL: http://svn.apache.org/viewvc?rev=1730687&view=rev Log: NUTCH-1233 Rely on Tika for outlink extraction
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730687&r1=1730686&r2=1730687&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 16 13:39:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1233 Rely on Tika for outlink extraction (markus) + * NUTCH-2210 Upgrade to Tika 1.12 (markus) * NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen) Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1730687&r1=1730686&r2=1730687&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Tue Feb 16 13:39:18 2016 @@ -355,7 +355,9 @@ class DOMBuilder implements ContentHandl */ public void endElement(String ns, String localName, String name) throws org.xml.sax.SAXException { - m_elemStack.pop(); + if (!m_elemStack.isEmpty()) { + m_elemStack.pop(); + } m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek(); } Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1730687&r1=1730686&r2=1730687&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Feb 16 13:39:18 2016 @@ -22,11 +22,14 @@ import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; import org.apache.nutch.util.URLUtil; +import org.apache.tika.sax.Link; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -57,6 +60,7 @@ public class DOMContentUtils { } private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private HashSet<String> ignoredTags = new HashSet<String>(); private Configuration conf; public DOMContentUtils(Configuration conf) { @@ -85,6 +89,7 @@ public class DOMContentUtils { // remove unwanted link tags from the linkParams map String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + ignoredTags.add(ignoreTags[i].toLowerCase()); if (!forceTags.contains(ignoreTags[i])) linkParams.remove(ignoreTags[i]); } @@ -244,7 +249,7 @@ public class DOMContentUtils { } return true; } - + // this only covers a few cases of empty links that are symptomatic // of nekohtml's DOM-fixup process... private boolean shouldThrowAwayLink(Node node, NodeList children, @@ -365,5 +370,33 @@ public class DOMContentUtils { } } } - -} + + // This one is used by NUTCH-1918 + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) { + String target = null; + String anchor = null; + boolean noFollow = false; + + for (Link link : tikaExtractedOutlinks) { + target = link.getUri(); + noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false; + anchor = link.getText(); + + if (!ignoredTags.contains(link.getType())) { + if (target != null && !noFollow) { + try { + URL url = URLUtil.resolveURL(base, target); + + // clean the anchor + anchor = anchor.replaceAll("\\s+", " "); + anchor = anchor.trim(); + + outlinks.add(new Outlink(url.toString(), anchor)); + } catch (MalformedURLException e) { + // don't care + } + } + } + } + } +} \ No newline at end of file Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1730687&r1=1730686&r2=1730687&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Feb 16 13:39:18 2016 @@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; @@ -43,9 +44,13 @@ import org.apache.tika.parser.ParseConte import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.sax.Link; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; +import org.xml.sax.ContentHandler; /** * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML @@ -95,16 +100,20 @@ public class TikaParser implements org.a HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); + DOMBuilder domhandler = new DOMBuilder(doc, root); + LinkContentHandler linkContentHandler = new LinkContentHandler(); domhandler.setUpperCaseElementNames(upperCaseElementNames); domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); ParseContext context = new ParseContext(); + TeeContentHandler teeContentHandler = new TeeContentHandler(domhandler, linkContentHandler); + if (HTMLMapper != null) context.set(HtmlMapper.class, HTMLMapper); tikamd.set(Metadata.CONTENT_TYPE, mimeType); try { - parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context); + parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + content.getUrl(), e); return new ParseStatus(ParseStatus.FAILED, e.getMessage()) @@ -147,7 +156,12 @@ public class TikaParser implements org.a if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } - utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + + // pre-1233 outlink extraction + //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + // Get outlinks from Tika + List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); + utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in "