http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java deleted file mode 100644 index 77a1044..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java +++ /dev/null @@ -1,794 +0,0 @@ -/* - * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 - * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to - * avoid dependency on Xalan. - */ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ - */ -package org.apache.nutch.parse.tika; - -import java.util.Stack; - -import org.w3c.dom.Comment; -import org.w3c.dom.Document; -import org.w3c.dom.DocumentFragment; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.Text; -import org.w3c.dom.CDATASection; - -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.Locator; -import org.xml.sax.ext.LexicalHandler; - -/** - * This class takes SAX events (in addition to some extra events that SAX - * doesn't handle yet) and adds the result to a document or document fragment. - */ -class DOMBuilder implements ContentHandler, LexicalHandler { - private boolean upperCaseElementNames = true; - - /** Root document */ - public Document m_doc; - - /** Current node */ - protected Node m_currentNode = null; - - /** First node of document fragment or null if not a DocumentFragment */ - public DocumentFragment m_docFrag = null; - - /** Vector of element nodes */ - protected Stack<Element> m_elemStack = new Stack<Element>(); - - /** - * Element recorded with this namespace will be converted to Node without a - * namespace - */ - private String defaultNamespaceURI = null; - - /** - * DOMBuilder instance constructor... it will add the DOM nodes to the - * document fragment. - * - * @param doc - * Root document - * @param node - * Current node - */ - DOMBuilder(Document doc, Node node) { - m_doc = doc; - m_currentNode = node; - } - - /** - * DOMBuilder instance constructor... it will add the DOM nodes to the - * document fragment. - * - * @param doc - * Root document - * @param docFrag - * Document fragment - */ - DOMBuilder(Document doc, DocumentFragment docFrag) { - m_doc = doc; - m_docFrag = docFrag; - } - - /** - * DOMBuilder instance constructor... it will add the DOM nodes to the - * document. - * - * @param doc - * Root document - */ - DOMBuilder(Document doc) { - m_doc = doc; - } - - /** - * Get the root node of the DOM being created. This is either a Document or a - * DocumentFragment. - * - * @return The root document or document fragment if not null - */ - Node getRootNode() { - return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; - } - - /** - * Get the node currently being processed. - * - * @return the current node being processed - */ - Node getCurrentNode() { - return m_currentNode; - } - - /** - * Return null since there is no Writer for this class. - * - * @return null - */ - java.io.Writer getWriter() { - return null; - } - - /** - * Append a node to the current container. - * - * @param newNode - * New node to append - */ - protected void append(Node newNode) throws org.xml.sax.SAXException { - - Node currentNode = m_currentNode; - - if (null != currentNode) { - currentNode.appendChild(newNode); - - // System.out.println(newNode.getNodeName()); - } else if (null != m_docFrag) { - m_docFrag.appendChild(newNode); - } else { - boolean ok = true; - short type = newNode.getNodeType(); - - if (type == Node.TEXT_NODE) { - String data = newNode.getNodeValue(); - - if ((null != data) && (data.trim().length() > 0)) { - throw new org.xml.sax.SAXException( - "Warning: can't output text before document element! Ignoring..."); - } - - ok = false; - } else if (type == Node.ELEMENT_NODE) { - if (m_doc.getDocumentElement() != null) { - throw new org.xml.sax.SAXException( - "Can't have more than one root on a DOM!"); - } - } - - if (ok) - m_doc.appendChild(newNode); - } - } - - /** - * Receive an object for locating the origin of SAX document events. - * - * <p> - * SAX parsers are strongly encouraged (though not absolutely required) to - * supply a locator: if it does so, it must supply the locator to the - * application by invoking this method before invoking any of the other - * methods in the ContentHandler interface. - * </p> - * - * <p> - * The locator allows the application to determine the end position of any - * document-related event, even if the parser is not reporting an error. - * Typically, the application will use this information for reporting its own - * errors (such as character content that does not match an application's - * business rules). The information returned by the locator is probably not - * sufficient for use with a search engine. - * </p> - * - * <p> - * Note that the locator will return correct information only during the - * invocation of the events in this interface. The application should not - * attempt to use it at any other time. - * </p> - * - * @param locator - * An object that can return the location of any SAX document event. - * @see org.xml.sax.Locator - */ - public void setDocumentLocator(Locator locator) { - - // No action for the moment. - } - - /** - * Receive notification of the beginning of a document. - * - * <p> - * The SAX parser will invoke this method only once, before any other methods - * in this interface or in DTDHandler (except for setDocumentLocator). - * </p> - */ - public void startDocument() throws org.xml.sax.SAXException { - - // No action for the moment. - } - - /** - * Receive notification of the end of a document. - * - * <p> - * The SAX parser will invoke this method only once, and it will be the last - * method invoked during the parse. The parser shall not invoke this method - * until it has either abandoned parsing (because of an unrecoverable error) - * or reached the end of input. - * </p> - */ - public void endDocument() throws org.xml.sax.SAXException { - - // No action for the moment. - } - - /** - * Receive notification of the beginning of an element. - * - * <p> - * The Parser will invoke this method at the beginning of every element in the - * XML document; there will be a corresponding endElement() event for every - * startElement() event (even when the element is empty). All of the element's - * content will be reported, in order, before the corresponding endElement() - * event. - * </p> - * - * <p> - * If the element name has a namespace prefix, the prefix will still be - * attached. Note that the attribute list provided will contain only - * attributes with explicit values (specified or defaulted): #IMPLIED - * attributes will be omitted. - * </p> - * - * - * @param ns - * The namespace of the node - * @param localName - * The local part of the qualified name - * @param name - * The element name. - * @param atts - * The attributes attached to the element, if any. - * @see #endElement - * @see org.xml.sax.Attributes - */ - public void startElement(String ns, String localName, String name, - Attributes atts) throws org.xml.sax.SAXException { - - Element elem; - - if (upperCaseElementNames) - name = name.toUpperCase(); - - // Note that the namespace-aware call must be used to correctly - // construct a Level 2 DOM, even for non-namespaced nodes. - if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI)) - elem = m_doc.createElementNS(null, name); - else - elem = m_doc.createElementNS(ns, name); - - append(elem); - - try { - int nAtts = atts.getLength(); - - if (0 != nAtts) { - for (int i = 0; i < nAtts; i++) { - - // System.out.println("type " + atts.getType(i) + " name " + - // atts.getLocalName(i) ); - // First handle a possible ID attribute - if (atts.getType(i).equalsIgnoreCase("ID")) - setIDAttribute(atts.getValue(i), elem); - - String attrNS = atts.getURI(i); - - if ("".equals(attrNS)) - attrNS = null; // DOM represents no-namespace as null - - // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) - // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); - // Crimson won't let us set an xmlns: attribute on the DOM. - String attrQName = atts.getQName(i); - - // In SAX, xmlns: attributes have an empty namespace, while in DOM - // they should have the xmlns namespace - if (attrQName.startsWith("xmlns:")) - attrNS = "http://www.w3.org/2000/xmlns/"; - - // ALWAYS use the DOM Level 2 call! - elem.setAttributeNS(attrNS, attrQName, atts.getValue(i)); - } - } - - // append(elem); - - m_elemStack.push(elem); - - m_currentNode = elem; - - // append(elem); - } catch (java.lang.Exception de) { - // de.printStackTrace(); - throw new org.xml.sax.SAXException(de); - } - - } - - /** - * - * - * - * Receive notification of the end of an element. - * - * <p> - * The SAX parser will invoke this method at the end of every element in the - * XML document; there will be a corresponding startElement() event for every - * endElement() event (even when the element is empty). - * </p> - * - * <p> - * If the element name has a namespace prefix, the prefix will still be - * attached to the name. - * </p> - * - * - * @param ns - * the namespace of the element - * @param localName - * The local part of the qualified name of the element - * @param name - * The element name - */ - public void endElement(String ns, String localName, String name) - throws org.xml.sax.SAXException { - if (!m_elemStack.isEmpty()) { - m_elemStack.pop(); - } - m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek(); - } - - /** - * Set an ID string to node association in the ID table. - * - * @param id - * The ID string. - * @param elem - * The associated ID. - */ - public void setIDAttribute(String id, Element elem) { - - // Do nothing. This method is meant to be overiden. - } - - /** - * Receive notification of character data. - * - * <p> - * The Parser will call this method to report each chunk of character data. - * SAX parsers may return all contiguous character data in a single chunk, or - * they may split it into several chunks; however, all of the characters in - * any single event must come from the same external entity, so that the - * Locator provides useful information. - * </p> - * - * <p> - * The application must not attempt to read from the array outside of the - * specified range. - * </p> - * - * <p> - * Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating parsers must - * do so). - * </p> - * - * @param ch - * The characters from the XML document. - * @param start - * The start position in the array. - * @param length - * The number of characters to read from the array. - * @see #ignorableWhitespace - * @see org.xml.sax.Locator - */ - public void characters(char ch[], int start, int length) - throws org.xml.sax.SAXException { - if (isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error - - if (m_inCData) { - cdata(ch, start, length); - - return; - } - - String s = new String(ch, start, length); - Node childNode; - childNode = m_currentNode != null ? m_currentNode.getLastChild() : null; - if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) { - ((Text) childNode).appendData(s); - } else { - Text text = m_doc.createTextNode(s); - append(text); - } - } - - /** - * If available, when the disable-output-escaping attribute is used, output - * raw text without escaping. A PI will be inserted in front of the node with - * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom". - * - * @param ch - * Array containing the characters - * @param start - * Index to start of characters in the array - * @param length - * Number of characters in the array - */ - public void charactersRaw(char ch[], int start, int length) - throws org.xml.sax.SAXException { - if (isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error - - String s = new String(ch, start, length); - - append(m_doc.createProcessingInstruction("xslt-next-is-raw", - "formatter-to-dom")); - append(m_doc.createTextNode(s)); - } - - /** - * Report the beginning of an entity. - * - * The start and end of the document entity are not reported. The start and - * end of the external DTD subset are reported using the pseudo-name "[dtd]". - * All other events must be properly nested within start/end entity events. - * - * @param name - * The name of the entity. If it is a parameter entity, the name will - * begin with '%'. - * @see #endEntity - * @see org.xml.sax.ext.DeclHandler#internalEntityDecl - * @see org.xml.sax.ext.DeclHandler#externalEntityDecl - */ - public void startEntity(String name) throws org.xml.sax.SAXException { - - // Almost certainly the wrong behavior... - // entityReference(name); - } - - /** - * Report the end of an entity. - * - * @param name - * The name of the entity that is ending. - * @see #startEntity - */ - public void endEntity(String name) throws org.xml.sax.SAXException { - } - - /** - * Receive notivication of a entityReference. - * - * @param name - * name of the entity reference - */ - public void entityReference(String name) throws org.xml.sax.SAXException { - append(m_doc.createEntityReference(name)); - } - - /** - * Receive notification of ignorable whitespace in element content. - * - * <p> - * Validating Parsers must use this method to report each chunk of ignorable - * whitespace (see the W3C XML 1.0 recommendation, section 2.10): - * non-validating parsers may also use this method if they are capable of - * parsing and using content models. - * </p> - * - * <p> - * SAX parsers may return all contiguous whitespace in a single chunk, or they - * may split it into several chunks; however, all of the characters in any - * single event must come from the same external entity, so that the Locator - * provides useful information. - * </p> - * - * <p> - * The application must not attempt to read from the array outside of the - * specified range. - * </p> - * - * @param ch - * The characters from the XML document. - * @param start - * The start position in the array. - * @param length - * The number of characters to read from the array. - * @see #characters - */ - public void ignorableWhitespace(char ch[], int start, int length) - throws org.xml.sax.SAXException { - if (isOutsideDocElem()) - return; // avoid DOM006 Hierarchy request error - - String s = new String(ch, start, length); - - append(m_doc.createTextNode(s)); - } - - /** - * Tell if the current node is outside the document element. - * - * @return true if the current node is outside the document element. - */ - private boolean isOutsideDocElem() { - return (null == m_docFrag) - && m_elemStack.size() == 0 - && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); - } - - /** - * Receive notification of a processing instruction. - * - * <p> - * The Parser will invoke this method once for each processing instruction - * found: note that processing instructions may occur before or after the main - * document element. - * </p> - * - * <p> - * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) - * or a text declaration (XML 1.0, section 4.3.1) using this method. - * </p> - * - * @param target - * The processing instruction target. - * @param data - * The processing instruction data, or null if none was supplied. - */ - public void processingInstruction(String target, String data) - throws org.xml.sax.SAXException { - append(m_doc.createProcessingInstruction(target, data)); - } - - /** - * Report an XML comment anywhere in the document. - * - * This callback will be used for comments inside or outside the document - * element, including comments in the external DTD subset (if read). - * - * @param ch - * An array holding the characters in the comment. - * @param start - * The starting position in the array. - * @param length - * The number of characters to use from the array. - */ - public void comment(char ch[], int start, int length) - throws org.xml.sax.SAXException { - // tagsoup sometimes submits invalid values here - if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) - return; - append(m_doc.createComment(new String(ch, start, length))); - } - - /** Flag indicating that we are processing a CData section */ - protected boolean m_inCData = false; - - /** - * Report the start of a CDATA section. - * - * @see #endCDATA - */ - public void startCDATA() throws org.xml.sax.SAXException { - m_inCData = true; - append(m_doc.createCDATASection("")); - } - - /** - * Report the end of a CDATA section. - * - * @see #startCDATA - */ - public void endCDATA() throws org.xml.sax.SAXException { - m_inCData = false; - } - - /** - * Receive notification of cdata. - * - * <p> - * The Parser will call this method to report each chunk of character data. - * SAX parsers may return all contiguous character data in a single chunk, or - * they may split it into several chunks; however, all of the characters in - * any single event must come from the same external entity, so that the - * Locator provides useful information. - * </p> - * - * <p> - * The application must not attempt to read from the array outside of the - * specified range. - * </p> - * - * <p> - * Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating parsers must - * do so). - * </p> - * - * @param ch - * The characters from the XML document. - * @param start - * The start position in the array. - * @param length - * The number of characters to read from the array. - * @see #ignorableWhitespace - * @see org.xml.sax.Locator - */ - public void cdata(char ch[], int start, int length) - throws org.xml.sax.SAXException { - if (isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error - - String s = new String(ch, start, length); - - // XXX [email protected]: modified from the original, to accomodate TagSoup. - Node n = m_currentNode.getLastChild(); - if (n instanceof CDATASection) - ((CDATASection) n).appendData(s); - else if (n instanceof Comment) - ((Comment) n).appendData(s); - } - - /** - * Report the start of DTD declarations, if any. - * - * Any declarations are assumed to be in the internal subset unless otherwise - * indicated. - * - * @param name - * The document type name. - * @param publicId - * The declared public identifier for the external DTD subset, or - * null if none was declared. - * @param systemId - * The declared system identifier for the external DTD subset, or - * null if none was declared. - * @see #endDTD - * @see #startEntity - */ - public void startDTD(String name, String publicId, String systemId) - throws org.xml.sax.SAXException { - - // Do nothing for now. - } - - /** - * Report the end of DTD declarations. - * - * @see #startDTD - */ - public void endDTD() throws org.xml.sax.SAXException { - - // Do nothing for now. - } - - /** - * Begin the scope of a prefix-URI Namespace mapping. - * - * <p> - * The information from this event is not necessary for normal Namespace - * processing: the SAX XML reader will automatically replace prefixes for - * element and attribute names when the http://xml.org/sax/features/namespaces - * feature is true (the default). - * </p> - * - * <p> - * There are cases, however, when applications need to use prefixes in - * character data or in attribute values, where they cannot safely be expanded - * automatically; the start/endPrefixMapping event supplies the information to - * the application to expand prefixes in those contexts itself, if necessary. - * </p> - * - * <p> - * Note that start/endPrefixMapping events are not guaranteed to be properly - * nested relative to each-other: all startPrefixMapping events will occur - * before the corresponding startElement event, and all endPrefixMapping - * events will occur after the corresponding endElement event, but their order - * is not guaranteed. - * </p> - * - * @param prefix - * The Namespace prefix being declared. - * @param uri - * The Namespace URI the prefix is mapped to. - * @see #endPrefixMapping - * @see #startElement - */ - public void startPrefixMapping(String prefix, String uri) - throws org.xml.sax.SAXException { - - /* - * // Not sure if this is needed or wanted // Also, it fails in the stree. - * if((null != m_currentNode) && (m_currentNode.getNodeType() == - * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) && - * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname - * = "xmlns:"+prefix; - * - * Element elem = (Element)m_currentNode; String val = - * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null) - * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname, - * uri); } } - */ - } - - /** - * End the scope of a prefix-URI mapping. - * - * <p> - * See startPrefixMapping for details. This event will always occur after the - * corresponding endElement event, but the order of endPrefixMapping events is - * not otherwise guaranteed. - * </p> - * - * @param prefix - * The prefix that was being mapping. - * @see #startPrefixMapping - * @see #endElement - */ - public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { - } - - /** - * Receive notification of a skipped entity. - * - * <p> - * The Parser will invoke this method once for each entity skipped. - * Non-validating processors may skip entities if they have not seen the - * declarations (because, for example, the entity was declared in an external - * DTD subset). All processors may skip external entities, depending on the - * values of the http://xml.org/sax/features/external-general-entities and the - * http://xml.org/sax/features/external-parameter-entities properties. - * </p> - * - * @param name - * The name of the skipped entity. If it is a parameter entity, the - * name will begin with '%'. - */ - public void skippedEntity(String name) throws org.xml.sax.SAXException { - } - - public boolean isUpperCaseElementNames() { - return upperCaseElementNames; - } - - public void setUpperCaseElementNames(boolean upperCaseElementNames) { - this.upperCaseElementNames = upperCaseElementNames; - } - - public String getDefaultNamespaceURI() { - return defaultNamespaceURI; - } - - public void setDefaultNamespaceURI(String defaultNamespaceURI) { - this.defaultNamespaceURI = defaultNamespaceURI; - } -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java deleted file mode 100644 index 5c4c990..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java +++ /dev/null @@ -1,402 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.tika; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.util.NodeWalker; -import org.apache.nutch.util.URLUtil; -import org.apache.tika.sax.Link; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - -/** - * A collection of methods for extracting content from DOM trees. - * - * This class holds a few utility methods for pulling content out of DOM nodes, - * such as getOutlinks, getText, etc. - * - */ -public class DOMContentUtils { - - private static class LinkParams { - private String elName; - private String attrName; - private int childLen; - - private LinkParams(String elName, String attrName, int childLen) { - this.elName = elName; - this.attrName = attrName; - this.childLen = childLen; - } - - public String toString() { - return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; - } - } - - private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); - private HashSet<String> ignoredTags = new HashSet<String>(); - private Configuration conf; - - public DOMContentUtils(Configuration conf) { - setConf(conf); - } - - public void setConf(Configuration conf) { - // forceTags is used to override configurable tag ignoring, later on - Collection<String> forceTags = new ArrayList<String>(1); - - this.conf = conf; - linkParams.clear(); - linkParams.put("a", new LinkParams("a", "href", 1)); - linkParams.put("area", new LinkParams("area", "href", 0)); - if (conf.getBoolean("parser.html.form.use_action", true)) { - linkParams.put("form", new LinkParams("form", "action", 1)); - if (conf.get("parser.html.form.use_action") != null) - forceTags.add("form"); - } - linkParams.put("frame", new LinkParams("frame", "src", 0)); - linkParams.put("iframe", new LinkParams("iframe", "src", 0)); - linkParams.put("script", new LinkParams("script", "src", 0)); - linkParams.put("link", new LinkParams("link", "href", 0)); - linkParams.put("img", new LinkParams("img", "src", 0)); - - // remove unwanted link tags from the linkParams map - String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); - for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { - ignoredTags.add(ignoreTags[i].toLowerCase()); - if (!forceTags.contains(ignoreTags[i])) - linkParams.remove(ignoreTags[i]); - } - } - - /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will - * append all the content text found beneath the DOM node to the - * <code>StringBuffer</code>. - * - * <p> - * - * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted - * and the <code>StringBuffer</code> will not contain any text encountered - * after a nested anchor is found. - * - * <p> - * - * @return true if nested anchors were found - */ - private boolean getText(StringBuffer sb, Node node, - boolean abortOnNestedAnchors) { - if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { - return true; - } - return false; - } - - /** - * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. - * - */ - public void getText(StringBuffer sb, Node node) { - getText(sb, node, false); - } - - // returns true if abortOnNestedAnchors is true and we find nested - // anchors - private boolean getTextHelper(StringBuffer sb, Node node, - boolean abortOnNestedAnchors, int anchorDepth) { - boolean abort = false; - NodeWalker walker = new NodeWalker(node); - - while (walker.hasNext()) { - - Node currentNode = walker.nextNode(); - String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); - - if ("script".equalsIgnoreCase(nodeName)) { - walker.skipChildren(); - } - if ("style".equalsIgnoreCase(nodeName)) { - walker.skipChildren(); - } - if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { - anchorDepth++; - if (anchorDepth > 1) { - abort = true; - break; - } - } - if (nodeType == Node.COMMENT_NODE) { - walker.skipChildren(); - } - if (nodeType == Node.TEXT_NODE) { - // cleanup and trim the value - String text = currentNode.getNodeValue(); - text = text.replaceAll("\\s+", " "); - text = text.trim(); - if (text.length() > 0) { - if (sb.length() > 0) - sb.append(' '); - sb.append(text); - } - } - } - - return abort; - } - - /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will - * append the content text found beneath the first <code>title</code> node to - * the <code>StringBuffer</code>. - * - * @return true if a title node was found, false otherwise - */ - public boolean getTitle(StringBuffer sb, Node node) { - - NodeWalker walker = new NodeWalker(node); - - while (walker.hasNext()) { - - Node currentNode = walker.nextNode(); - String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); - - if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD - return false; - } - - if (nodeType == Node.ELEMENT_NODE) { - if ("title".equalsIgnoreCase(nodeName)) { - getText(sb, currentNode); - return true; - } - } - } - - return false; - } - - /** If Node contains a BASE tag then it's HREF is returned. */ - URL getBase(Node node) { - - NodeWalker walker = new NodeWalker(node); - - while (walker.hasNext()) { - - Node currentNode = walker.nextNode(); - String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); - - // is this node a BASE tag? - if (nodeType == Node.ELEMENT_NODE) { - - if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD - return null; - } - - if ("base".equalsIgnoreCase(nodeName)) { - NamedNodeMap attrs = currentNode.getAttributes(); - for (int i = 0; i < attrs.getLength(); i++) { - Node attr = attrs.item(i); - if ("href".equalsIgnoreCase(attr.getNodeName())) { - try { - return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) { - } - } - } - } - } - } - - // no. - return null; - } - - private boolean hasOnlyWhiteSpace(Node node) { - String val = node.getNodeValue(); - for (int i = 0; i < val.length(); i++) { - if (!Character.isWhitespace(val.charAt(i))) - return false; - } - return true; - } - - // this only covers a few cases of empty links that are symptomatic - // of nekohtml's DOM-fixup process... - private boolean shouldThrowAwayLink(Node node, NodeList children, - int childLen, LinkParams params) { - if (childLen == 0) { - // this has no inner structure - if (params.childLen == 0) - return false; - else - return true; - } else if ((childLen == 1) - && (children.item(0).getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { - // single nested link - return true; - - } else if (childLen == 2) { - - Node c0 = children.item(0); - Node c1 = children.item(1); - - if ((c0.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c0.getNodeName())) - && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { - // single link followed by whitespace node - return true; - } - - if ((c1.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { - // whitespace node followed by single link - return true; - } - - } else if (childLen == 3) { - Node c0 = children.item(0); - Node c1 = children.item(1); - Node c2 = children.item(2); - - if ((c1.getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) - && hasOnlyWhiteSpace(c2)) { - // single link surrounded by whitespace nodes - return true; - } - } - - return false; - } - - /** - * This method finds all anchors below the supplied DOM <code>node</code>, and - * creates appropriate {@link Outlink} records for each (relative to the - * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> - * {@link ArrayList}. - * - * <p> - * - * Links without inner structure (tags, text, etc) are discarded, as are links - * which contain only single nested links and empty text nodes (this is a - * common DOM-fixup artifact, at least with nekohtml). - */ - public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { - - NodeWalker walker = new NodeWalker(node); - while (walker.hasNext()) { - - Node currentNode = walker.nextNode(); - String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); - NodeList children = currentNode.getChildNodes(); - int childLen = (children != null) ? children.getLength() : 0; - - if (nodeType == Node.ELEMENT_NODE) { - - nodeName = nodeName.toLowerCase(); - LinkParams params = (LinkParams) linkParams.get(nodeName); - if (params != null) { - if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { - - StringBuffer linkText = new StringBuffer(); - getText(linkText, currentNode, true); - - NamedNodeMap attrs = currentNode.getAttributes(); - String target = null; - boolean noFollow = false; - boolean post = false; - for (int i = 0; i < attrs.getLength(); i++) { - Node attr = attrs.item(i); - String attrName = attr.getNodeName(); - if (params.attrName.equalsIgnoreCase(attrName)) { - target = attr.getNodeValue(); - } else if ("rel".equalsIgnoreCase(attrName) - && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { - noFollow = true; - } else if ("method".equalsIgnoreCase(attrName) - && "post".equalsIgnoreCase(attr.getNodeValue())) { - post = true; - } - } - if (target != null && !noFollow && !post) - try { - - URL url = URLUtil.resolveURL(base, target); - outlinks.add(new Outlink(url.toString(), linkText.toString() - .trim())); - } catch (MalformedURLException e) { - // don't care - } - } - // this should not have any children, skip them - if (params.childLen == 0) - continue; - } - } - } - } - - // This one is used by NUTCH-1918 - public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) { - String target = null; - String anchor = null; - boolean noFollow = false; - - for (Link link : tikaExtractedOutlinks) { - target = link.getUri(); - noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false; - anchor = link.getText(); - - if (!ignoredTags.contains(link.getType())) { - if (target != null && !noFollow) { - try { - URL url = URLUtil.resolveURL(base, target); - - // clean the anchor - anchor = anchor.replaceAll("\\s+", " "); - anchor = anchor.trim(); - - outlinks.add(new Outlink(url.toString(), anchor)); - } catch (MalformedURLException e) { - // don't care - } - } - } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java deleted file mode 100644 index 294bde9..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java +++ /dev/null @@ -1,214 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.tika; - -import java.net.URL; - -import org.apache.nutch.parse.HTMLMetaTags; -import org.w3c.dom.*; - -/** - * Class for parsing META Directives from DOM trees. This class handles - * specifically Robots META directives (all, none, nofollow, noindex), finding - * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are - * stored in a HTMLMetaTags instance. - */ -public class HTMLMetaProcessor { - - /** - * Utility class with indicators for the robots directives "noindex" and - * "nofollow", and HTTP-EQUIV/no-cache - */ - - /** - * Sets the indicators in <code>robotsMeta</code> to appropriate values, based - * on any META tags found under the given <code>node</code>. - */ - public static final void getMetaTags(HTMLMetaTags metaTags, Node node, - URL currURL) { - - metaTags.reset(); - getMetaTagsHelper(metaTags, node, currURL); - } - - private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, - URL currURL) { - - if (node.getNodeType() == Node.ELEMENT_NODE) { - - if ("body".equalsIgnoreCase(node.getNodeName())) { - // META tags should not be under body - return; - } - - if ("meta".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - Node nameNode = null; - Node equivNode = null; - Node contentNode = null; - // Retrieves name, http-equiv and content attribues - for (int i = 0; i < attrs.getLength(); i++) { - Node attr = attrs.item(i); - String attrName = attr.getNodeName().toLowerCase(); - if (attrName.equals("name")) { - nameNode = attr; - } else if (attrName.equals("http-equiv")) { - equivNode = attr; - } else if (attrName.equals("content")) { - contentNode = attr; - } - } - - if (nameNode != null) { - if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); - metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); - if ("robots".equals(name)) { - - if (contentNode != null) { - String directives = contentNode.getNodeValue().toLowerCase(); - int index = directives.indexOf("none"); - - if (index >= 0) { - metaTags.setNoIndex(); - metaTags.setNoFollow(); - } - - index = directives.indexOf("all"); - if (index >= 0) { - // do nothing... - } - - index = directives.indexOf("noindex"); - if (index >= 0) { - metaTags.setNoIndex(); - } - - index = directives.indexOf("nofollow"); - if (index >= 0) { - metaTags.setNoFollow(); - } - - index = directives.indexOf("noarchive"); - if (index >= 0) { - metaTags.setNoCache(); - } - } - - } // end if (name == robots) - } - } - - if (equivNode != null) { - if (contentNode != null) { - String name = equivNode.getNodeValue().toLowerCase(); - String content = contentNode.getNodeValue(); - metaTags.getHttpEquivTags().setProperty(name, content); - if ("pragma".equals(name)) { - content = content.toLowerCase(); - int index = content.indexOf("no-cache"); - if (index >= 0) - metaTags.setNoCache(); - } else if ("refresh".equals(name)) { - int idx = content.indexOf(';'); - String time = null; - if (idx == -1) { // just the refresh time - time = content; - } else - time = content.substring(0, idx); - try { - metaTags.setRefreshTime(Integer.parseInt(time)); - // skip this if we couldn't parse the time - metaTags.setRefresh(true); - } catch (Exception e) { - ; - } - URL refreshUrl = null; - if (metaTags.getRefresh() && idx != -1) { // set the URL - idx = content.toLowerCase().indexOf("url="); - if (idx == -1) { // assume a mis-formatted entry with just the - // url - idx = content.indexOf(';') + 1; - } else - idx += 4; - if (idx != -1) { - String url = content.substring(idx); - try { - refreshUrl = new URL(url); - } catch (Exception e) { - // XXX according to the spec, this has to be an absolute - // XXX url. However, many websites use relative URLs and - // XXX expect browsers to handle that. - // XXX Unfortunately, in some cases this may create a - // XXX infinitely recursive paths (a crawler trap)... - // if (!url.startsWith("/")) url = "/" + url; - try { - refreshUrl = new URL(currURL, url); - } catch (Exception e1) { - refreshUrl = null; - } - } - } - } - if (metaTags.getRefresh()) { - if (refreshUrl == null) { - // apparently only refresh time was present. set the URL - // to the same URL. - refreshUrl = currURL; - } - metaTags.setRefreshHref(refreshUrl); - } - } - } - } - - } else if ("base".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - Node hrefNode = attrs.getNamedItem("href"); - - if (hrefNode != null) { - String urlString = hrefNode.getNodeValue(); - - URL url = null; - try { - if (currURL == null) - url = new URL(urlString); - else - url = new URL(currURL, urlString); - } catch (Exception e) { - ; - } - - if (url != null) - metaTags.setBaseHref(url); - } - - } - - } - - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++) { - getMetaTagsHelper(metaTags, children.item(i), currURL); - } - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java deleted file mode 100644 index 5d7eca9..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parse.tika; - -import java.io.ByteArrayInputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.html.dom.HTMLDocumentImpl; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilters; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.protocol.Content; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.html.BoilerpipeContentHandler; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.Link; -import org.apache.tika.sax.LinkContentHandler; -import org.apache.tika.sax.TeeContentHandler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; -import org.xml.sax.ContentHandler; - -/** - * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML - * representation returned by Tika as SAX events - ***/ - -public class TikaParser implements org.apache.nutch.parse.Parser { - - public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class); - - private Configuration conf; - private TikaConfig tikaConfig = null; - private DOMContentUtils utils; - private HtmlParseFilters htmlParseFilters; - private String cachingPolicy; - private HtmlMapper HTMLMapper; - private boolean upperCaseElementNames = true; - - @SuppressWarnings("deprecation") - public ParseResult getParse(Content content) { - String mimeType = content.getContentType(); - - boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe"); - String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); - - URL base; - try { - base = new URL(content.getBaseUrl()); - } catch (MalformedURLException e) { - return new ParseStatus(e) - .getEmptyParseResult(content.getUrl(), getConf()); - } - - // get the right parser using the mime type as a clue - Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); - byte[] raw = content.getContent(); - - if (parser == null) { - String message = "Can't retrieve Tika parser for mime-type " + mimeType; - LOG.error(message); - return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult( - content.getUrl(), getConf()); - } - - LOG.debug("Using Tika parser " + parser.getClass().getName() - + " for mime-type " + mimeType); - - Metadata tikamd = new Metadata(); - - HTMLDocumentImpl doc = new HTMLDocumentImpl(); - doc.setErrorChecking(false); - DocumentFragment root = doc.createDocumentFragment(); - - ContentHandler domHandler; - - // Check whether to use Tika's BoilerplateContentHandler - if (useBoilerpipe) { - BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), - BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); - bpHandler.setIncludeMarkup(true); - domHandler = (ContentHandler)bpHandler; - } else { - DOMBuilder domBuilder = new DOMBuilder(doc, root); - domBuilder.setUpperCaseElementNames(upperCaseElementNames); - domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); - domHandler = (ContentHandler)domBuilder; - } - - LinkContentHandler linkContentHandler = new LinkContentHandler(); - - ParseContext context = new ParseContext(); - TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler); - - if (HTMLMapper != null) - context.set(HtmlMapper.class, HTMLMapper); - tikamd.set(Metadata.CONTENT_TYPE, mimeType); - try { - parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context); - } catch (Exception e) { - LOG.error("Error parsing " + content.getUrl(), e); - return new ParseStatus(ParseStatus.FAILED, e.getMessage()) - .getEmptyParseResult(content.getUrl(), getConf()); - } - - HTMLMetaTags metaTags = new HTMLMetaTags(); - String text = ""; - String title = ""; - Outlink[] outlinks = new Outlink[0]; - org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); - - // we have converted the sax events generated by Tika into a DOM object - // so we can now use the usual HTML resources from Nutch - // get meta directives - HTMLMetaProcessor.getMetaTags(metaTags, root, base); - if (LOG.isTraceEnabled()) { - LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); - } - - // check meta directives - if (!metaTags.getNoIndex()) { // okay to index - StringBuffer sb = new StringBuffer(); - if (LOG.isTraceEnabled()) { - LOG.trace("Getting text..."); - } - utils.getText(sb, root); // extract text - text = sb.toString(); - sb.setLength(0); - if (LOG.isTraceEnabled()) { - LOG.trace("Getting title..."); - } - utils.getTitle(sb, root); // extract title - title = sb.toString().trim(); - } - - if (!metaTags.getNoFollow()) { // okay to follow links - ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = utils.getBase(root); - if (LOG.isTraceEnabled()) { - LOG.trace("Getting links..."); - } - - // pre-1233 outlink extraction - //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); - // Get outlinks from Tika - List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); - utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks); - outlinks = l.toArray(new Outlink[l.size()]); - if (LOG.isTraceEnabled()) { - LOG.trace("found " + outlinks.length + " outlinks in " - + content.getUrl()); - } - } - - // populate Nutch metadata with Tika metadata - String[] TikaMDNames = tikamd.names(); - for (String tikaMDName : TikaMDNames) { - if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) - continue; - String[] values = tikamd.getValues(tikaMDName); - for (String v : values) - nutchMetadata.add(tikaMDName, v); - } - - // no outlinks? try OutlinkExtractor e.g works for mime types where no - // explicit markup for anchors - - if (outlinks.length == 0) { - outlinks = OutlinkExtractor.getOutlinks(text, getConf()); - } - - ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); - if (metaTags.getRefresh()) { - status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); - status.setArgs(new String[] { metaTags.getRefreshHref().toString(), - Integer.toString(metaTags.getRefreshTime()) }); - } - ParseData parseData = new ParseData(status, title, outlinks, - content.getMetadata(), nutchMetadata); - ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), - new ParseImpl(text, parseData)); - - // run filters on parse - ParseResult filteredParse = this.htmlParseFilters.filter(content, - parseResult, metaTags, root); - if (metaTags.getNoCache()) { // not okay to cache - for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) - entry.getValue().getData().getParseMeta() - .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); - } - return filteredParse; - } - - public void setConf(Configuration conf) { - this.conf = conf; - this.tikaConfig = null; - - // do we want a custom Tika configuration file - // deprecated since Tika 0.7 which is based on - // a service provider based configuration - String customConfFile = conf.get("tika.config.file"); - if (customConfFile != null) { - try { - // see if a Tika config file can be found in the job file - URL customTikaConfig = conf.getResource(customConfFile); - if (customTikaConfig != null) - tikaConfig = new TikaConfig(customTikaConfig); - } catch (Exception e1) { - String message = "Problem loading custom Tika configuration from " - + customConfFile; - LOG.error(message, e1); - } - } else { - try { - tikaConfig = new TikaConfig(this.getClass().getClassLoader()); - } catch (Exception e2) { - String message = "Problem loading default Tika configuration"; - LOG.error(message, e2); - } - } - - // use a custom htmlmapper - String htmlmapperClassName = conf.get("tika.htmlmapper.classname"); - if (StringUtils.isNotBlank(htmlmapperClassName)) { - try { - Class HTMLMapperClass = Class.forName(htmlmapperClassName); - boolean interfaceOK = HtmlMapper.class - .isAssignableFrom(HTMLMapperClass); - if (!interfaceOK) { - throw new RuntimeException("Class " + htmlmapperClassName - + " does not implement HtmlMapper"); - } - HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance(); - } catch (Exception e) { - LOG.error("Can't generate instance for class " + htmlmapperClassName); - throw new RuntimeException("Can't generate instance for class " - + htmlmapperClassName); - } - } - - this.htmlParseFilters = new HtmlParseFilters(getConf()); - this.utils = new DOMContentUtils(conf); - this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", - Nutch.CACHING_FORBIDDEN_CONTENT); - this.upperCaseElementNames = getConf().getBoolean( - "tika.uppercase.element.names", true); - } - - public Configuration getConf() { - return this.conf; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java deleted file mode 100644 index d625c33..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 - * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer, - * XXX in order to avoid dependency on Xalan. - */ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $ - */ -package org.apache.nutch.parse.tika; - -/** - * Class used to verify whether the specified <var>ch</var> conforms to the XML - * 1.0 definition of whitespace. - */ -class XMLCharacterRecognizer { - - /** - * Returns whether the specified <var>ch</var> conforms to the XML 1.0 - * definition of whitespace. Refer to <A - * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of - * <CODE>S</CODE></A> for details. - * - * @param ch - * Character to check as XML whitespace. - * @return =true if <var>ch</var> is XML whitespace; otherwise =false. - */ - static boolean isWhiteSpace(char ch) { - return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA); - } - - /** - * Tell if the string is whitespace. - * - * @param ch - * Character array to check as XML whitespace. - * @param start - * Start index of characters in the array - * @param length - * Number of characters in the array - * @return True if the characters in the array are XML whitespace; otherwise, - * false. - */ - static boolean isWhiteSpace(char ch[], int start, int length) { - - int end = start + length; - - for (int s = start; s < end; s++) { - if (!isWhiteSpace(ch[s])) - return false; - } - - return true; - } - - /** - * Tell if the string is whitespace. - * - * @param buf - * StringBuffer to check as XML whitespace. - * @return True if characters in buffer are XML whitespace, false otherwise - */ - static boolean isWhiteSpace(StringBuffer buf) { - - int n = buf.length(); - - for (int i = 0; i < n; i++) { - if (!isWhiteSpace(buf.charAt(i))) - return false; - } - - return true; - } - - /** - * Tell if the string is whitespace. - * - * @param s - * String to check as XML whitespace. - * @return True if characters in buffer are XML whitespace, false otherwise - */ - static boolean isWhiteSpace(String s) { - - if (null != s) { - int n = s.length(); - - for (int i = 0; i < n; i++) { - if (!isWhiteSpace(s.charAt(i))) - return false; - } - } - - return true; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java deleted file mode 100644 index 19e3f47..0000000 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse various document formats with help of - * <a href="http://tika.apache.org/">Apache Tika</a>. - */ -package org.apache.nutch.parse.tika; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java deleted file mode 100644 index 96029a6..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java +++ /dev/null @@ -1,337 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.tika.DOMContentUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.ByteArrayInputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; - -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; -import org.cyberneko.html.parsers.DOMFragmentParser; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * Unit tests for DOMContentUtils. - */ -public class TestDOMContentUtils { - - private static final String[] testPages = { - - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" + "</body></html>"), - - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" - + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" - + "</body></html>"), - - new String("<html><head><title> </title>" + "</head><body> " - + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" - + "</a></a>" + "</body></html>"), - - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" - + "</body></html>"), - - // test frameset link extraction. The invalid frame in the middle - // will be - // fixed to a third standalone frame. - new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" - + "</frame>" + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" - + "</frame>" + "<frame src=\"right.html\">" + "</frame>" - + "</frameset>" + "</frameset>" + "</body></html>"), - - // test <area> and <iframe> link extraction + url normalization - new String( - "<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), - - // test whitespace processing for plain text extraction - new String( - "<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." + "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; - - private static int SKIP = 9; - - private static String[] testBaseHrefs = { "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", "http://www.nutch.org/", - "http://www.nutch.org/", "http://www.nutch.org/", - "http://www.nutch.org/;something" }; - - private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs = new URL[testPages.length]; - - private static final String[] answerText = { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", "ignore ignore", "test1 test2", - "test1 test2", "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5" }; - - private static final String[] answerTitle = { "title", "title", "", - "my title", "my title", "my title", "my title", "", "", "", "title", - "title" }; - - // note: should be in page-order - private static Outlink[][] answerOutlinks; - - private static Configuration conf; - private static DOMContentUtils utils = null; - - @Before - public void setup() throws Exception { - conf = NutchConfiguration.create(); - conf.setBoolean("parser.html.form.use_action", true); - utils = new DOMContentUtils(conf); - DOMFragmentParser parser = new DOMFragmentParser(); - parser.setFeature( - "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", - true); - for (int i = 0; i < testPages.length; i++) { - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - try { - parser.parse( - new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), - node); - testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); - } catch (Exception e) { - Assert.assertTrue("caught exception: " + e, false); - } - testDOMs[i] = node; - } - answerOutlinks = new Outlink[][] { - { new Outlink("http://www.nutch.org", "anchor"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, - { new Outlink("http://www.nutch.org/", "separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), }, - { new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), }, - { new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), }, - { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, - {}, - { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, - {}, - { new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", - "anchor5") } }; - - } - - private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1 = new StringTokenizer(s1); - StringTokenizer st2 = new StringTokenizer(s2); - - while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) - return false; - if (!st1.nextToken().equals(st2.nextToken())) - return false; - } - if (st2.hasMoreTokens()) - return false; - return true; - } - - @Test - public void testGetText() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerText[i], text)); - } - } - - @Test - public void testGetTitle() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getTitle(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerTitle[i], text)); - } - } - - @Test - public void testGetOutlinks() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - if (i == SKIP) { - conf.setBoolean("parser.html.form.use_action", false); - utils.setConf(conf); - } else { - conf.setBoolean("parser.html.form.use_action", true); - utils.setConf(conf); - } - utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr = new Outlink[outlinks.size()]; - outlinkArr = outlinks.toArray(outlinkArr); - compareOutlinks(answerOutlinks[i], outlinkArr); - } - } - - private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i = 0; i < o.length; i++) { - sb.append(o[i].toString()); - sb.append(System.getProperty("line.separator")); - } - } - - private static final String outlinksString(Outlink[] o) { - StringBuffer sb = new StringBuffer(); - appendOutlinks(sb, o); - return sb.toString(); - } - - private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { - if (o1.length != o2.length) { - Assert.assertTrue( - "got wrong number of outlinks (expecting " + o1.length + ", got " - + o2.length + ")" + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) + System.getProperty("line.separator"), - false); - } - - for (int i = 0; i < o1.length; i++) { - if (!o1[i].equals(o2[i])) { - Assert.assertTrue( - "got wrong outlinks at position " + i - + System.getProperty("line.separator") + "answer: " - + System.getProperty("line.separator") + "'" + o1[i].getToUrl() - + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") + "got: " - + System.getProperty("line.separator") + "'" + o2[i].getToUrl() - + "', anchor: '" + o2[i].getAnchor() + "'", false); - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java deleted file mode 100644 index c9394dc..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.tika.TikaParser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; - -/** - * - * @author mattmann / jnioche - * - * Test Suite for the RSS feeds with the {@link TikaParser}. - * - */ -public class TestFeedParser { - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - private String[] sampleFiles = { "rsstest.rss" }; - - public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class - .getName()); - - /** - * <p> - * The test method: tests out the following 2 asserts: - * </p> - * - * <ul> - * <li>There are 3 outlinks read from the sample rss file</li> - * <li>The 3 outlinks read are in fact the correct outlinks from the sample - * file</li> - * </ul> - */ - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - // check that there are 2 outlinks: - // unlike the original parse-rss - // tika ignores the URL and description of the channel - - // http://test.channel.com - // http://www-scf.usc.edu/~mattmann/ - // http://www.nutch.org - - ParseData theParseData = parse.getData(); - - Outlink[] theOutlinks = theParseData.getOutlinks(); - - Assert.assertTrue("There aren't 2 outlinks read!", - theOutlinks.length == 2); - - // now check to make sure that those are the two outlinks - boolean hasLink1 = false, hasLink2 = false; - - for (int j = 0; j < theOutlinks.length; j++) { - if (theOutlinks[j].getToUrl().equals( - "http://www-scf.usc.edu/~mattmann/")) { - hasLink1 = true; - } - - if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { - hasLink2 = true; - } - } - - if (!hasLink1 || !hasLink2) { - Assert.fail("Outlinks read from sample rss file are not correct!"); - } - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java deleted file mode 100644 index b1762e6..0000000 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Test extraction of image metadata - */ -public class TestImageMetadata { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - private String[] sampleFiles = { "nutch_logo_tm.gif", }; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - Configuration conf = NutchConfiguration.create(); - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - Assert.assertEquals("121", parse.getData().getMeta("width")); - Assert.assertEquals("48", parse.getData().getMeta("height")); - } - } - -}
