http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java new file mode 100644 index 0000000..5c4c990 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -0,0 +1,402 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.tika; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.nutch.util.URLUtil; +import org.apache.tika.sax.Link; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * A collection of methods for extracting content from DOM trees. 
+ * + * This class holds a few utility methods for pulling content out of DOM nodes, + * such as getOutlinks, getText, etc. + * + */ +public class DOMContentUtils { + + private static class LinkParams { + private String elName; + private String attrName; + private int childLen; + + private LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private HashSet<String> ignoredTags = new HashSet<String>(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + ignoredTags.add(ignoreTags[i].toLowerCase()); + if (!forceTags.contains(ignoreTags[i])) + linkParams.remove(ignoreTags[i]); + } + } + + 
/** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * <code>StringBuffer</code>. + * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted + * and the <code>StringBuffer</code> will not contain any text encountered + * after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + private boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + /** + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) + sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and 
a DOM {@link Node}, and will + * append the content text found beneath the first <code>title</code> node to + * the <code>StringBuffer</code>. + * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) { + } + } + } + } + } + } + + // no. + return null; + } + + private boolean hasOnlyWhiteSpace(Node node) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0 = children.item(0); + Node c1 = children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * This method finds all anchors below the supplied DOM <code>node</code>, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> + * {@link ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). 
+ */ + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = (LinkParams) linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, currentNode, true); + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = URLUtil.resolveURL(base, target); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) + continue; + } + } + } + } + + // This one is used by NUTCH-1918 + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) { + String target = null; + String anchor = null; + boolean noFollow = false; + + for (Link link : tikaExtractedOutlinks) { + target = link.getUri(); + noFollow = 
(link.getRel().toLowerCase().equals("nofollow")) ? true : false; + anchor = link.getText(); + + if (!ignoredTags.contains(link.getType())) { + if (target != null && !noFollow) { + try { + URL url = URLUtil.resolveURL(base, target); + + // clean the anchor + anchor = anchor.replaceAll("\\s+", " "); + anchor = anchor.trim(); + + outlinks.add(new Outlink(url.toString(), anchor)); + } catch (MalformedURLException e) { + // don't care + } + } + } + } + } +} \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java new file mode 100644 index 0000000..294bde9 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java @@ -0,0 +1,214 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.tika; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. 
+ */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in <code>robotsMeta</code> to appropriate values, based + * on any META tags found under the given <code>node</code>. + */ + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName().toLowerCase(); + if (attrName.equals("name")) { + nameNode = attr; + } else if (attrName.equals("http-equiv")) { + equivNode = attr; + } else if (attrName.equals("content")) { + contentNode = attr; + } + } + + if (nameNode != null) { + if (contentNode != null) { + String name = nameNode.getNodeValue().toLowerCase(); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); + if ("robots".equals(name)) { + + if (contentNode != null) { + String directives = contentNode.getNodeValue().toLowerCase(); + int index = directives.indexOf("none"); + + if (index >= 0) { + metaTags.setNoIndex(); + metaTags.setNoFollow(); + } + + index = directives.indexOf("all"); + if (index >= 0) { + // do nothing... 
+ } + + index = directives.indexOf("noindex"); + if (index >= 0) { + metaTags.setNoIndex(); + } + + index = directives.indexOf("nofollow"); + if (index >= 0) { + metaTags.setNoFollow(); + } + + index = directives.indexOf("noarchive"); + if (index >= 0) { + metaTags.setNoCache(); + } + } + + } // end if (name == robots) + } + } + + if (equivNode != null) { + if (contentNode != null) { + String name = equivNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue(); + metaTags.getHttpEquivTags().setProperty(name, content); + if ("pragma".equals(name)) { + content = content.toLowerCase(); + int index = content.indexOf("no-cache"); + if (index >= 0) + metaTags.setNoCache(); + } else if ("refresh".equals(name)) { + int idx = content.indexOf(';'); + String time = null; + if (idx == -1) { // just the refresh time + time = content; + } else + time = content.substring(0, idx); + try { + metaTags.setRefreshTime(Integer.parseInt(time)); + // skip this if we couldn't parse the time + metaTags.setRefresh(true); + } catch (Exception e) { + ; + } + URL refreshUrl = null; + if (metaTags.getRefresh() && idx != -1) { // set the URL + idx = content.toLowerCase().indexOf("url="); + if (idx == -1) { // assume a mis-formatted entry with just the + // url + idx = content.indexOf(';') + 1; + } else + idx += 4; + if (idx != -1) { + String url = content.substring(idx); + try { + refreshUrl = new URL(url); + } catch (Exception e) { + // XXX according to the spec, this has to be an absolute + // XXX url. However, many websites use relative URLs and + // XXX expect browsers to handle that. + // XXX Unfortunately, in some cases this may create a + // XXX infinitely recursive paths (a crawler trap)... + // if (!url.startsWith("/")) url = "/" + url; + try { + refreshUrl = new URL(currURL, url); + } catch (Exception e1) { + refreshUrl = null; + } + } + } + } + if (metaTags.getRefresh()) { + if (refreshUrl == null) { + // apparently only refresh time was present. 
set the URL + // to the same URL. + refreshUrl = currURL; + } + metaTags.setRefreshHref(refreshUrl); + } + } + } + } + + } else if ("base".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + + if (hrefNode != null) { + String urlString = hrefNode.getNodeValue(); + + URL url = null; + try { + if (currURL == null) + url = new URL(urlString); + else + url = new URL(currURL, urlString); + } catch (Exception e) { + ; + } + + if (url != null) + metaTags.setBaseHref(url); + } + + } + + } + + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + getMetaTagsHelper(metaTags, children.item(i), currURL); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java new file mode 100644 index 0000000..5d7eca9 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.tika; + +import java.io.ByteArrayInputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.html.dom.HTMLDocumentImpl; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilters; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.protocol.Content; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.sax.Link; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.ContentHandler; + +/** + * Wrapper for Tika parsers. 
Mimics the HTMLParser but using the XHTML + * representation returned by Tika as SAX events + ***/ + +public class TikaParser implements org.apache.nutch.parse.Parser { + + public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class); + + private Configuration conf; + private TikaConfig tikaConfig = null; + private DOMContentUtils utils; + private HtmlParseFilters htmlParseFilters; + private String cachingPolicy; + private HtmlMapper HTMLMapper; + private boolean upperCaseElementNames = true; + + @SuppressWarnings("deprecation") + public ParseResult getParse(Content content) { + String mimeType = content.getContentType(); + + boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe"); + String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); + + URL base; + try { + base = new URL(content.getBaseUrl()); + } catch (MalformedURLException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + // get the right parser using the mime type as a clue + Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); + byte[] raw = content.getContent(); + + if (parser == null) { + String message = "Can't retrieve Tika parser for mime-type " + mimeType; + LOG.error(message); + return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult( + content.getUrl(), getConf()); + } + + LOG.debug("Using Tika parser " + parser.getClass().getName() + + " for mime-type " + mimeType); + + Metadata tikamd = new Metadata(); + + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment root = doc.createDocumentFragment(); + + ContentHandler domHandler; + + // Check whether to use Tika's BoilerplateContentHandler + if (useBoilerpipe) { + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), + BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); + 
bpHandler.setIncludeMarkup(true); + domHandler = (ContentHandler)bpHandler; + } else { + DOMBuilder domBuilder = new DOMBuilder(doc, root); + domBuilder.setUpperCaseElementNames(upperCaseElementNames); + domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); + domHandler = (ContentHandler)domBuilder; + } + + LinkContentHandler linkContentHandler = new LinkContentHandler(); + + ParseContext context = new ParseContext(); + TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler); + + if (HTMLMapper != null) + context.set(HtmlMapper.class, HTMLMapper); + tikamd.set(Metadata.CONTENT_TYPE, mimeType); + try { + parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context); + } catch (Exception e) { + LOG.error("Error parsing " + content.getUrl(), e); + return new ParseStatus(ParseStatus.FAILED, e.getMessage()) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + HTMLMetaTags metaTags = new HTMLMetaTags(); + String text = ""; + String title = ""; + Outlink[] outlinks = new Outlink[0]; + org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); + + // we have converted the sax events generated by Tika into a DOM object + // so we can now use the usual HTML resources from Nutch + // get meta directives + HTMLMetaProcessor.getMetaTags(metaTags, root, base); + if (LOG.isTraceEnabled()) { + LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); + } + + // check meta directives + if (!metaTags.getNoIndex()) { // okay to index + StringBuffer sb = new StringBuffer(); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting text..."); + } + utils.getText(sb, root); // extract text + text = sb.toString(); + sb.setLength(0); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting title..."); + } + utils.getTitle(sb, root); // extract title + title = sb.toString().trim(); + } + + if (!metaTags.getNoFollow()) { // okay to follow links + ArrayList<Outlink> l = new 
ArrayList<Outlink>(); // extract outlinks + URL baseTag = utils.getBase(root); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting links..."); + } + + // pre-1233 outlink extraction + //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + // Get outlinks from Tika + List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); + utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks); + outlinks = l.toArray(new Outlink[l.size()]); + if (LOG.isTraceEnabled()) { + LOG.trace("found " + outlinks.length + " outlinks in " + + content.getUrl()); + } + } + + // populate Nutch metadata with Tika metadata + String[] TikaMDNames = tikamd.names(); + for (String tikaMDName : TikaMDNames) { + if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) + continue; + String[] values = tikamd.getValues(tikaMDName); + for (String v : values) + nutchMetadata.add(tikaMDName, v); + } + + // no outlinks? try OutlinkExtractor e.g works for mime types where no + // explicit markup for anchors + + if (outlinks.length == 0) { + outlinks = OutlinkExtractor.getOutlinks(text, getConf()); + } + + ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); + if (metaTags.getRefresh()) { + status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); + status.setArgs(new String[] { metaTags.getRefreshHref().toString(), + Integer.toString(metaTags.getRefreshTime()) }); + } + ParseData parseData = new ParseData(status, title, outlinks, + content.getMetadata(), nutchMetadata); + ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), + new ParseImpl(text, parseData)); + + // run filters on parse + ParseResult filteredParse = this.htmlParseFilters.filter(content, + parseResult, metaTags, root); + if (metaTags.getNoCache()) { // not okay to cache + for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) + entry.getValue().getData().getParseMeta() + .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); + } + return filteredParse; + } + + public void 
setConf(Configuration conf) { + this.conf = conf; + this.tikaConfig = null; + + // do we want a custom Tika configuration file + // deprecated since Tika 0.7 which is based on + // a service provider based configuration + String customConfFile = conf.get("tika.config.file"); + if (customConfFile != null) { + try { + // see if a Tika config file can be found in the job file + URL customTikaConfig = conf.getResource(customConfFile); + if (customTikaConfig != null) + tikaConfig = new TikaConfig(customTikaConfig); + } catch (Exception e1) { + String message = "Problem loading custom Tika configuration from " + + customConfFile; + LOG.error(message, e1); + } + } else { + try { + tikaConfig = new TikaConfig(this.getClass().getClassLoader()); + } catch (Exception e2) { + String message = "Problem loading default Tika configuration"; + LOG.error(message, e2); + } + } + + // use a custom htmlmapper + String htmlmapperClassName = conf.get("tika.htmlmapper.classname"); + if (StringUtils.isNotBlank(htmlmapperClassName)) { + try { + Class HTMLMapperClass = Class.forName(htmlmapperClassName); + boolean interfaceOK = HtmlMapper.class + .isAssignableFrom(HTMLMapperClass); + if (!interfaceOK) { + throw new RuntimeException("Class " + htmlmapperClassName + + " does not implement HtmlMapper"); + } + HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance(); + } catch (Exception e) { + LOG.error("Can't generate instance for class " + htmlmapperClassName); + throw new RuntimeException("Can't generate instance for class " + + htmlmapperClassName); + } + } + + this.htmlParseFilters = new HtmlParseFilters(getConf()); + this.utils = new DOMContentUtils(conf); + this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", + Nutch.CACHING_FORBIDDEN_CONTENT); + this.upperCaseElementNames = getConf().getBoolean( + "tika.uppercase.element.names", true); + } + + public Configuration getConf() { + return this.conf; + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java new file mode 100644 index 0000000..d625c33 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java @@ -0,0 +1,112 @@ +/* + * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer, + * XXX in order to avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.apache.nutch.parse.tika; + +/** + * Class used to verify whether the specified <var>ch</var> conforms to the XML + * 1.0 definition of whitespace. 
/**
 * Utility for testing characters against the XML 1.0 definition of
 * whitespace (copied from Xalan-J to avoid a dependency on Xalan).
 */
class XMLCharacterRecognizer {

  /**
   * Tests a single character against the XML 1.0 whitespace production
   * <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"><CODE>S</CODE></A>
   * (space, tab, carriage return, line feed).
   *
   * @param ch
   *          Character to check as XML whitespace.
   * @return true if <var>ch</var> is XML whitespace, false otherwise.
   */
  static boolean isWhiteSpace(char ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      return true;
    default:
      return false;
    }
  }

  /**
   * Tests whether a slice of a character array consists entirely of XML
   * whitespace.
   *
   * @param ch
   *          Character array to check as XML whitespace.
   * @param start
   *          Start index of characters in the array.
   * @param length
   *          Number of characters to examine.
   * @return true if every examined character is XML whitespace (an empty
   *         slice counts as whitespace), false otherwise.
   */
  static boolean isWhiteSpace(char ch[], int start, int length) {
    for (int i = start, end = start + length; i < end; i++) {
      if (!isWhiteSpace(ch[i])) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a StringBuffer consists entirely of XML whitespace.
   *
   * @param buf
   *          StringBuffer to check as XML whitespace.
   * @return true if every character in the buffer is XML whitespace (an
   *         empty buffer counts as whitespace), false otherwise.
   */
  static boolean isWhiteSpace(StringBuffer buf) {
    for (int i = 0, n = buf.length(); i < n; i++) {
      if (!isWhiteSpace(buf.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a String consists entirely of XML whitespace.
   *
   * @param s
   *          String to check as XML whitespace.
   * @return true if every character in the string is XML whitespace; a null
   *         or empty string is treated as whitespace (original Xalan
   *         behavior), false otherwise.
   */
  static boolean isWhiteSpace(String s) {
    if (s == null) {
      return true;
    }
    for (int i = 0, n = s.length(); i < n; i++) {
      if (!isWhiteSpace(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

}
+ */ +package org.apache.nutch.parse.tika; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java new file mode 100644 index 0000000..96029a6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java @@ -0,0 +1,337 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.tika.DOMContentUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Unit tests for DOMContentUtils. + */ +public class TestDOMContentUtils { + + private static final String[] testPages = { + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. 
+ new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." 
+ "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + + private static int SKIP = 9; + + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new 
URL[testPages.length]; + + private static final String[] answerText = { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; + + // note: should be in page-order + private static Outlink[][] answerOutlinks; + + private static Configuration conf; + private static DOMContentUtils utils = null; + + @Before + public void setup() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean("parser.html.form.use_action", true); + utils = new DOMContentUtils(conf); + DOMFragmentParser parser = new DOMFragmentParser(); + parser.setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + for (int i = 0; i < testPages.length; i++) { + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + try { + parser.parse( + new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), + node); + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); + } catch (Exception e) { + Assert.assertTrue("caught exception: " + e, false); + } + testDOMs[i] = node; + } + answerOutlinks = new Outlink[][] { + { new Outlink("http://www.nutch.org", "anchor"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", 
"separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; + + } + + private static boolean equalsIgnoreWhitespace(String s1, String s2) { + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); + + while (st1.hasMoreTokens()) { + if (!st2.hasMoreTokens()) + return false; + if (!st1.nextToken().equals(st2.nextToken())) + return false; + } + if (st2.hasMoreTokens()) + return false; + return true; + } + + @Test + public void testGetText() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { 
+ StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerText[i], text)); + } + } + + @Test + public void testGetTitle() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getTitle(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerTitle[i], text)); + } + } + + @Test + public void testGetOutlinks() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + if (i == SKIP) { + conf.setBoolean("parser.html.form.use_action", false); + utils.setConf(conf); + } else { + conf.setBoolean("parser.html.form.use_action", true); + utils.setConf(conf); + } + utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = outlinks.toArray(outlinkArr); + compareOutlinks(answerOutlinks[i], outlinkArr); + } + } + + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { + for (int i = 0; i < o.length; i++) { + sb.append(o[i].toString()); + sb.append(System.getProperty("line.separator")); + } + } + + private static final String outlinksString(Outlink[] o) { + StringBuffer sb = new StringBuffer(); + appendOutlinks(sb, o); + return sb.toString(); + } + + private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { + if (o1.length != o2.length) { + Assert.assertTrue( + "got wrong number of outlinks (expecting " + o1.length + ", got " + + o2.length + ")" + 
System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + outlinksString(o1) + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + System.getProperty("line.separator"), + false); + } + + for (int i = 0; i < o1.length; i++) { + if (!o1[i].equals(o2[i])) { + Assert.assertTrue( + "got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); + } + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java new file mode 100644 index 0000000..c9394dc --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.tika.TikaParser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +/** + * + * @author mattmann / jnioche + * + * Test Suite for the RSS feeds with the {@link TikaParser}. 
+ * + */ +public class TestFeedParser { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + private String[] sampleFiles = { "rsstest.rss" }; + + public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class + .getName()); + + /** + * <p> + * The test method: tests out the following 2 asserts: + * </p> + * + * <ul> + * <li>There are 3 outlinks read from the sample rss file</li> + * <li>The 3 outlinks read are in fact the correct outlinks from the sample + * file</li> + * </ul> + */ + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + // check that there are 2 outlinks: + // unlike the original parse-rss + // tika ignores the URL and description of the channel + + // http://test.channel.com + // http://www-scf.usc.edu/~mattmann/ + // http://www.nutch.org + + ParseData theParseData = parse.getData(); + + Outlink[] theOutlinks = theParseData.getOutlinks(); + + Assert.assertTrue("There aren't 2 outlinks read!", + theOutlinks.length == 2); + + // now check to make sure that those are the two outlinks + boolean hasLink1 = false, hasLink2 = false; + + for (int j = 0; j < theOutlinks.length; j++) { + if (theOutlinks[j].getToUrl().equals( + "http://www-scf.usc.edu/~mattmann/")) { + hasLink1 = true; + } + + if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { + 
hasLink2 = true; + } + } + + if (!hasLink1 || !hasLink2) { + Assert.fail("Outlinks read from sample rss file are not correct!"); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java new file mode 100644 index 0000000..b1762e6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test extraction of image metadata + */ +public class TestImageMetadata { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + private String[] sampleFiles = { "nutch_logo_tm.gif", }; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + Assert.assertEquals("121", parse.getData().getMeta("width")); + Assert.assertEquals("48", parse.getData().getMeta("height")); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java new file mode 100644 index 0000000..576b3df --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; + +/** + * Unit tests for MSWordParser. 
+ * + * @author John Xing + */ +public class TestMSWordParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-msword/build.xml during plugin compilation. + // Check ./src/plugin/parse-msword/sample/README.txt for what they are. + private String[] sampleFiles = { "word97.doc" }; + + private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + public String getTextContent(String fileName) throws ProtocolException, + ParseException { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + return parse.getText(); + } + + @Test + public void testIt() throws ProtocolException, ParseException { + for (int i = 0; i < sampleFiles.length; i++) { + String found = getTextContent(sampleFiles[i]); + Assert.assertTrue("text found : '" + found + "'", + found.startsWith(expectedText)); + } + } + + @Test + public void testOpeningDocs() throws ProtocolException, ParseException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".doc") == false) + continue; + Assert.assertTrue("cann't read content of " + filenames[i], + getTextContent(filenames[i]).length() > 0); + } + } +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java new file mode 100644 index 0000000..6960bad --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.*; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for OOParser. 
+ * + * @author Andrzej Bialecki + */ +public class TestOOParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-oo/build.xml during plugin compilation. + private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; + + private String expectedText; + + private String sampleText = "ootest.txt"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + Protocol protocol; + ProtocolFactory factory = new ProtocolFactory(conf); + + System.out.println("Expected : " + expectedText); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + if (sampleFiles[i].startsWith("ootest") == false) + continue; + + protocol = factory.getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + + // simply test for the presence of a text - the ordering of the elements + // may differ from what was expected + // in the previous tests + Assert.assertTrue(text != null && text.length() > 0); + + System.out.println("Found " + sampleFiles[i] + ": " + text); + } + } + + public TestOOParser() { + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleText); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + 
} + isr.close(); + expectedText = sb.toString(); + // normalize space + expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java new file mode 100644 index 0000000..9884f0c --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for PdfParser. + * + * @author John Xing + */ +public class TestPdfParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-pdf/build.xml during plugin compilation. + // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. 
+ private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; + + private String expectedText = "A VERY SMALL PDF FILE"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + int index = parse.getText().indexOf(expectedText); + Assert.assertTrue(index > 0); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java new file mode 100644 index 0000000..f15d821 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.tika; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). + * + * @author Andy Hedges + */ +public class TestRTFParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-rtf/build.xml during plugin compilation. + // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. + private String rtfFile = "test.rtf"; + + @Ignore("There seems to be an issue with line 71 e.g. 
text.trim()") + @Test + public void testIt() throws ProtocolException, ParseException { + + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + urlString = "file:" + sampleDir + fileSeparator + rtfFile; + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( + content.getUrl()); + String text = parse.getText(); + Assert.assertEquals("The quick brown fox jumps over the lazy dog", + text.trim()); + + String title = parse.getData().getTitle(); + Metadata meta = parse.getData().getParseMeta(); + + Assert.assertEquals("test rft document", title); + Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT)); + + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java new file mode 100644 index 0000000..4224f93 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.tika.HTMLMetaProcessor; + +import java.io.ByteArrayInputStream; +import java.net.URL; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for HTMLMetaProcessor. */ +public class TestRobotsMetaProcessor { + + /* + * + * some sample tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> + */ + + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test 
page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", + + }; + + public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! 
+ }; + + private URL[][] currURLsAndAnswers; + + @Test + public void testRobotsMetaProcessor() { + DOMFragmentParser parser = new DOMFragmentParser(); + ; + + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; + } catch (Exception e) { + Assert.assertTrue("couldn't make test URLs!", false); + } + + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); + + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + + try { + parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); + } catch (Exception e) { + e.printStackTrace(); + } + + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); + + Assert.assertTrue("got index wrong on test " + i, + robotsMeta.getNoIndex() == answers[i][0]); + Assert.assertTrue("got follow wrong on test " + i, + robotsMeta.getNoFollow() == answers[i][1]); + Assert.assertTrue("got cache wrong on test " + i, + robotsMeta.getNoCache() == answers[i][2]); + Assert + .assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); + + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf new file mode 100644 index 0000000..383cebb Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf differ
