Author: markus
Date: Tue Feb 16 13:39:18 2016
New Revision: 1730687
URL: http://svn.apache.org/viewvc?rev=1730687&view=rev
Log:
NUTCH-1233 Rely on Tika for outlink extraction
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730687&r1=1730686&r2=1730687&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 16 13:39:18 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1233 Rely on Tika for outlink extraction (markus)
+
* NUTCH-2210 Upgrade to Tika 1.12 (markus)
* NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen)
Modified:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1730687&r1=1730686&r2=1730687&view=diff
==============================================================================
---
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
(original)
+++
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
Tue Feb 16 13:39:18 2016
@@ -355,7 +355,9 @@ class DOMBuilder implements ContentHandl
*/
public void endElement(String ns, String localName, String name)
throws org.xml.sax.SAXException {
- m_elemStack.pop();
+ if (!m_elemStack.isEmpty()) {
+ m_elemStack.pop();
+ }
m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
}
Modified:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1730687&r1=1730686&r2=1730687&view=diff
==============================================================================
---
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
(original)
+++
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
Tue Feb 16 13:39:18 2016
@@ -22,11 +22,14 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
import org.apache.nutch.util.URLUtil;
+import org.apache.tika.sax.Link;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -57,6 +60,7 @@ public class DOMContentUtils {
}
private HashMap<String, LinkParams> linkParams = new HashMap<String,
LinkParams>();
+ private HashSet<String> ignoredTags = new HashSet<String>();
private Configuration conf;
public DOMContentUtils(Configuration conf) {
@@ -85,6 +89,7 @@ public class DOMContentUtils {
// remove unwanted link tags from the linkParams map
String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+ ignoredTags.add(ignoreTags[i].toLowerCase());
if (!forceTags.contains(ignoreTags[i]))
linkParams.remove(ignoreTags[i]);
}
@@ -244,7 +249,7 @@ public class DOMContentUtils {
}
return true;
}
-
+
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private boolean shouldThrowAwayLink(Node node, NodeList children,
@@ -365,5 +370,33 @@ public class DOMContentUtils {
}
}
}
-
-}
+
+ // Builds outlinks from the links extracted by Tika (NUTCH-1233; originally noted as used by NUTCH-1918)
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link>
tikaExtractedOutlinks) {
+ String target = null;
+ String anchor = null;
+ boolean noFollow = false;
+
+ for (Link link : tikaExtractedOutlinks) {
+ target = link.getUri();
+ noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true :
false;
+ anchor = link.getText();
+
+ if (!ignoredTags.contains(link.getType())) {
+ if (target != null && !noFollow) {
+ try {
+ URL url = URLUtil.resolveURL(base, target);
+
+ // clean the anchor
+ anchor = anchor.replaceAll("\\s+", " ");
+ anchor = anchor.trim();
+
+ outlinks.add(new Outlink(url.toString(), anchor));
+ } catch (MalformedURLException e) {
+ // ignore links whose target cannot be resolved into a valid URL
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
Modified:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1730687&r1=1730686&r2=1730687&view=diff
==============================================================================
---
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
(original)
+++
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Tue Feb 16 13:39:18 2016
@@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
+import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
@@ -43,9 +44,13 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.Link;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
+import org.xml.sax.ContentHandler;
/**
* Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
@@ -95,16 +100,20 @@ public class TikaParser implements org.a
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
DocumentFragment root = doc.createDocumentFragment();
+
DOMBuilder domhandler = new DOMBuilder(doc, root);
+ LinkContentHandler linkContentHandler = new LinkContentHandler();
domhandler.setUpperCaseElementNames(upperCaseElementNames);
domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
ParseContext context = new ParseContext();
+ TeeContentHandler teeContentHandler = new TeeContentHandler(domhandler,
linkContentHandler);
+
if (HTMLMapper != null)
context.set(HtmlMapper.class, HTMLMapper);
tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
- parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
+ parser.parse(new ByteArrayInputStream(raw),
(ContentHandler)teeContentHandler, tikamd, context);
} catch (Exception e) {
LOG.error("Error parsing " + content.getUrl(), e);
return new ParseStatus(ParseStatus.FAILED, e.getMessage())
@@ -147,7 +156,12 @@ public class TikaParser implements org.a
if (LOG.isTraceEnabled()) {
LOG.trace("Getting links...");
}
- utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+
+ // pre-NUTCH-1233 outlink extraction from the DOM root, kept for reference
+ //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+ // Get outlinks from Tika
+ List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
+ utils.getOutlinks(baseTag != null ? baseTag : base, l,
tikaExtractedOutlinks);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
LOG.trace("found " + outlinks.length + " outlinks in "