http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
deleted file mode 100644
index 77a1044..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
+++ /dev/null
@@ -1,794 +0,0 @@
-/*
- * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
- * avoid dependency on Xalan.
- */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
- */
-package org.apache.nutch.parse.tika;
-
-import java.util.Stack;
-
-import org.w3c.dom.Comment;
-import org.w3c.dom.Document;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.Text;
-import org.w3c.dom.CDATASection;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.ext.LexicalHandler;
-
-/**
- * This class takes SAX events (in addition to some extra events that SAX
- * doesn't handle yet) and adds the result to a document or document fragment.
- */
-class DOMBuilder implements ContentHandler, LexicalHandler {
-  private boolean upperCaseElementNames = true;
-
-  /** Root document */
-  public Document m_doc;
-
-  /** Current node */
-  protected Node m_currentNode = null;
-
-  /** First node of document fragment or null if not a DocumentFragment */
-  public DocumentFragment m_docFrag = null;
-
-  /** Vector of element nodes */
-  protected Stack<Element> m_elemStack = new Stack<Element>();
-
-  /**
-  * Element recorded with this namespace will be converted to Node without a
-  * namespace
-  */
-  private String defaultNamespaceURI = null;
-
-  /**
-   * DOMBuilder instance constructor... it will add the DOM nodes to the
-   * document fragment.
-   * 
-   * @param doc
-   *          Root document
-   * @param node
-   *          Current node
-   */
-  DOMBuilder(Document doc, Node node) {
-    m_doc = doc;
-    m_currentNode = node;
-  }
-
-  /**
-   * DOMBuilder instance constructor... it will add the DOM nodes to the
-   * document fragment.
-   * 
-   * @param doc
-   *          Root document
-   * @param docFrag
-   *          Document fragment
-   */
-  DOMBuilder(Document doc, DocumentFragment docFrag) {
-    m_doc = doc;
-    m_docFrag = docFrag;
-  }
-
-  /**
-   * DOMBuilder instance constructor... it will add the DOM nodes to the
-   * document.
-   * 
-   * @param doc
-   *          Root document
-   */
-  DOMBuilder(Document doc) {
-    m_doc = doc;
-  }
-
-  /**
-   * Get the root node of the DOM being created. This is either a Document or a
-   * DocumentFragment.
-   * 
-   * @return The root document or document fragment if not null
-   */
-  Node getRootNode() {
-    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
-  }
-
-  /**
-   * Get the node currently being processed.
-   * 
-   * @return the current node being processed
-   */
-  Node getCurrentNode() {
-    return m_currentNode;
-  }
-
-  /**
-   * Return null since there is no Writer for this class.
-   * 
-   * @return null
-   */
-  java.io.Writer getWriter() {
-    return null;
-  }
-
-  /**
-   * Append a node to the current container.
-   * 
-   * @param newNode
-   *          New node to append
-   */
-  protected void append(Node newNode) throws org.xml.sax.SAXException {
-
-    Node currentNode = m_currentNode;
-
-    if (null != currentNode) {
-      currentNode.appendChild(newNode);
-
-      // System.out.println(newNode.getNodeName());
-    } else if (null != m_docFrag) {
-      m_docFrag.appendChild(newNode);
-    } else {
-      boolean ok = true;
-      short type = newNode.getNodeType();
-
-      if (type == Node.TEXT_NODE) {
-        String data = newNode.getNodeValue();
-
-        if ((null != data) && (data.trim().length() > 0)) {
-          throw new org.xml.sax.SAXException(
-              "Warning: can't output text before document element!  Ignoring...");
-        }
-
-        ok = false;
-      } else if (type == Node.ELEMENT_NODE) {
-        if (m_doc.getDocumentElement() != null) {
-          throw new org.xml.sax.SAXException(
-              "Can't have more than one root on a DOM!");
-        }
-      }
-
-      if (ok)
-        m_doc.appendChild(newNode);
-    }
-  }
-
-  /**
-   * Receive an object for locating the origin of SAX document events.
-   * 
-   * <p>
-   * SAX parsers are strongly encouraged (though not absolutely required) to
-   * supply a locator: if it does so, it must supply the locator to the
-   * application by invoking this method before invoking any of the other
-   * methods in the ContentHandler interface.
-   * </p>
-   * 
-   * <p>
-   * The locator allows the application to determine the end position of any
-   * document-related event, even if the parser is not reporting an error.
-   * Typically, the application will use this information for reporting its own
-   * errors (such as character content that does not match an application's
-   * business rules). The information returned by the locator is probably not
-   * sufficient for use with a search engine.
-   * </p>
-   * 
-   * <p>
-   * Note that the locator will return correct information only during the
-   * invocation of the events in this interface. The application should not
-   * attempt to use it at any other time.
-   * </p>
-   * 
-   * @param locator
-   *          An object that can return the location of any SAX document event.
-   * @see org.xml.sax.Locator
-   */
-  public void setDocumentLocator(Locator locator) {
-
-    // No action for the moment.
-  }
-
-  /**
-   * Receive notification of the beginning of a document.
-   * 
-   * <p>
-   * The SAX parser will invoke this method only once, before any other methods
-   * in this interface or in DTDHandler (except for setDocumentLocator).
-   * </p>
-   */
-  public void startDocument() throws org.xml.sax.SAXException {
-
-    // No action for the moment.
-  }
-
-  /**
-   * Receive notification of the end of a document.
-   * 
-   * <p>
-   * The SAX parser will invoke this method only once, and it will be the last
-   * method invoked during the parse. The parser shall not invoke this method
-   * until it has either abandoned parsing (because of an unrecoverable error)
-   * or reached the end of input.
-   * </p>
-   */
-  public void endDocument() throws org.xml.sax.SAXException {
-
-    // No action for the moment.
-  }
-
-  /**
-   * Receive notification of the beginning of an element.
-   * 
-   * <p>
-   * The Parser will invoke this method at the beginning of every element in the
-   * XML document; there will be a corresponding endElement() event for every
-   * startElement() event (even when the element is empty). All of the element's
-   * content will be reported, in order, before the corresponding endElement()
-   * event.
-   * </p>
-   * 
-   * <p>
-   * If the element name has a namespace prefix, the prefix will still be
-   * attached. Note that the attribute list provided will contain only
-   * attributes with explicit values (specified or defaulted): #IMPLIED
-   * attributes will be omitted.
-   * </p>
-   * 
-   * 
-   * @param ns
-   *          The namespace of the node
-   * @param localName
-   *          The local part of the qualified name
-   * @param name
-   *          The element name.
-   * @param atts
-   *          The attributes attached to the element, if any.
-   * @see #endElement
-   * @see org.xml.sax.Attributes
-   */
-  public void startElement(String ns, String localName, String name,
-      Attributes atts) throws org.xml.sax.SAXException {
-
-    Element elem;
-
-    if (upperCaseElementNames)
-      name = name.toUpperCase();
-
-    // Note that the namespace-aware call must be used to correctly
-    // construct a Level 2 DOM, even for non-namespaced nodes.
-    if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
-      elem = m_doc.createElementNS(null, name);
-    else
-      elem = m_doc.createElementNS(ns, name);
-
-    append(elem);
-
-    try {
-      int nAtts = atts.getLength();
-
-      if (0 != nAtts) {
-        for (int i = 0; i < nAtts; i++) {
-
-          // System.out.println("type " + atts.getType(i) + " name " +
-          // atts.getLocalName(i) );
-          // First handle a possible ID attribute
-          if (atts.getType(i).equalsIgnoreCase("ID"))
-            setIDAttribute(atts.getValue(i), elem);
-
-          String attrNS = atts.getURI(i);
-
-          if ("".equals(attrNS))
-            attrNS = null; // DOM represents no-namespace as null
-
-          // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
-          // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
-          // Crimson won't let us set an xmlns: attribute on the DOM.
-          String attrQName = atts.getQName(i);
-
-          // In SAX, xmlns: attributes have an empty namespace, while in DOM
-          // they should have the xmlns namespace
-          if (attrQName.startsWith("xmlns:"))
-            attrNS = "http://www.w3.org/2000/xmlns/";
-
-          // ALWAYS use the DOM Level 2 call!
-          elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
-        }
-      }
-
-      // append(elem);
-
-      m_elemStack.push(elem);
-
-      m_currentNode = elem;
-
-      // append(elem);
-    } catch (java.lang.Exception de) {
-      // de.printStackTrace();
-      throw new org.xml.sax.SAXException(de);
-    }
-
-  }
-
-  /**
-   * 
-   * 
-   * 
-   * Receive notification of the end of an element.
-   * 
-   * <p>
-   * The SAX parser will invoke this method at the end of every element in the
-   * XML document; there will be a corresponding startElement() event for every
-   * endElement() event (even when the element is empty).
-   * </p>
-   * 
-   * <p>
-   * If the element name has a namespace prefix, the prefix will still be
-   * attached to the name.
-   * </p>
-   * 
-   * 
-   * @param ns
-   *          the namespace of the element
-   * @param localName
-   *          The local part of the qualified name of the element
-   * @param name
-   *          The element name
-   */
-  public void endElement(String ns, String localName, String name)
-      throws org.xml.sax.SAXException {
-    if (!m_elemStack.isEmpty()) {
-      m_elemStack.pop();
-    }
-    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
-  }
-
-  /**
-   * Set an ID string to node association in the ID table.
-   * 
-   * @param id
-   *          The ID string.
-   * @param elem
-   *          The associated ID.
-   */
-  public void setIDAttribute(String id, Element elem) {
-
-    // Do nothing. This method is meant to be overiden.
-  }
-
-  /**
-   * Receive notification of character data.
-   * 
-   * <p>
-   * The Parser will call this method to report each chunk of character data.
-   * SAX parsers may return all contiguous character data in a single chunk, or
-   * they may split it into several chunks; however, all of the characters in
-   * any single event must come from the same external entity, so that the
-   * Locator provides useful information.
-   * </p>
-   * 
-   * <p>
-   * The application must not attempt to read from the array outside of the
-   * specified range.
-   * </p>
-   * 
-   * <p>
-   * Note that some parsers will report whitespace using the
-   * ignorableWhitespace() method rather than this one (validating parsers must
-   * do so).
-   * </p>
-   * 
-   * @param ch
-   *          The characters from the XML document.
-   * @param start
-   *          The start position in the array.
-   * @param length
-   *          The number of characters to read from the array.
-   * @see #ignorableWhitespace
-   * @see org.xml.sax.Locator
-   */
-  public void characters(char ch[], int start, int length)
-      throws org.xml.sax.SAXException {
-    if (isOutsideDocElem()
-        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return; // avoid DOM006 Hierarchy request error
-
-    if (m_inCData) {
-      cdata(ch, start, length);
-
-      return;
-    }
-
-    String s = new String(ch, start, length);
-    Node childNode;
-    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
-    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
-      ((Text) childNode).appendData(s);
-    } else {
-      Text text = m_doc.createTextNode(s);
-      append(text);
-    }
-  }
-
-  /**
-   * If available, when the disable-output-escaping attribute is used, output
-   * raw text without escaping. A PI will be inserted in front of the node with
-   * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
-   * 
-   * @param ch
-   *          Array containing the characters
-   * @param start
-   *          Index to start of characters in the array
-   * @param length
-   *          Number of characters in the array
-   */
-  public void charactersRaw(char ch[], int start, int length)
-      throws org.xml.sax.SAXException {
-    if (isOutsideDocElem()
-        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return; // avoid DOM006 Hierarchy request error
-
-    String s = new String(ch, start, length);
-
-    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
-        "formatter-to-dom"));
-    append(m_doc.createTextNode(s));
-  }
-
-  /**
-   * Report the beginning of an entity.
-   * 
-   * The start and end of the document entity are not reported. The start and
-   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
-   * All other events must be properly nested within start/end entity events.
-   * 
-   * @param name
-   *          The name of the entity. If it is a parameter entity, the name will
-   *          begin with '%'.
-   * @see #endEntity
-   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
-   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
-   */
-  public void startEntity(String name) throws org.xml.sax.SAXException {
-
-    // Almost certainly the wrong behavior...
-    // entityReference(name);
-  }
-
-  /**
-   * Report the end of an entity.
-   * 
-   * @param name
-   *          The name of the entity that is ending.
-   * @see #startEntity
-   */
-  public void endEntity(String name) throws org.xml.sax.SAXException {
-  }
-
-  /**
-   * Receive notivication of a entityReference.
-   * 
-   * @param name
-   *          name of the entity reference
-   */
-  public void entityReference(String name) throws org.xml.sax.SAXException {
-    append(m_doc.createEntityReference(name));
-  }
-
-  /**
-   * Receive notification of ignorable whitespace in element content.
-   * 
-   * <p>
-   * Validating Parsers must use this method to report each chunk of ignorable
-   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
-   * non-validating parsers may also use this method if they are capable of
-   * parsing and using content models.
-   * </p>
-   * 
-   * <p>
-   * SAX parsers may return all contiguous whitespace in a single chunk, or they
-   * may split it into several chunks; however, all of the characters in any
-   * single event must come from the same external entity, so that the Locator
-   * provides useful information.
-   * </p>
-   * 
-   * <p>
-   * The application must not attempt to read from the array outside of the
-   * specified range.
-   * </p>
-   * 
-   * @param ch
-   *          The characters from the XML document.
-   * @param start
-   *          The start position in the array.
-   * @param length
-   *          The number of characters to read from the array.
-   * @see #characters
-   */
-  public void ignorableWhitespace(char ch[], int start, int length)
-      throws org.xml.sax.SAXException {
-    if (isOutsideDocElem())
-      return; // avoid DOM006 Hierarchy request error
-
-    String s = new String(ch, start, length);
-
-    append(m_doc.createTextNode(s));
-  }
-
-  /**
-   * Tell if the current node is outside the document element.
-   * 
-   * @return true if the current node is outside the document element.
-   */
-  private boolean isOutsideDocElem() {
-    return (null == m_docFrag)
-        && m_elemStack.size() == 0
-        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
-  }
-
-  /**
-   * Receive notification of a processing instruction.
-   * 
-   * <p>
-   * The Parser will invoke this method once for each processing instruction
-   * found: note that processing instructions may occur before or after the main
-   * document element.
-   * </p>
-   * 
-   * <p>
-   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
-   * or a text declaration (XML 1.0, section 4.3.1) using this method.
-   * </p>
-   * 
-   * @param target
-   *          The processing instruction target.
-   * @param data
-   *          The processing instruction data, or null if none was supplied.
-   */
-  public void processingInstruction(String target, String data)
-      throws org.xml.sax.SAXException {
-    append(m_doc.createProcessingInstruction(target, data));
-  }
-
-  /**
-   * Report an XML comment anywhere in the document.
-   * 
-   * This callback will be used for comments inside or outside the document
-   * element, including comments in the external DTD subset (if read).
-   * 
-   * @param ch
-   *          An array holding the characters in the comment.
-   * @param start
-   *          The starting position in the array.
-   * @param length
-   *          The number of characters to use from the array.
-   */
-  public void comment(char ch[], int start, int length)
-      throws org.xml.sax.SAXException {
-    // tagsoup sometimes submits invalid values here
-    if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
-      return;
-    append(m_doc.createComment(new String(ch, start, length)));
-  }
-
-  /** Flag indicating that we are processing a CData section */
-  protected boolean m_inCData = false;
-
-  /**
-   * Report the start of a CDATA section.
-   * 
-   * @see #endCDATA
-   */
-  public void startCDATA() throws org.xml.sax.SAXException {
-    m_inCData = true;
-    append(m_doc.createCDATASection(""));
-  }
-
-  /**
-   * Report the end of a CDATA section.
-   * 
-   * @see #startCDATA
-   */
-  public void endCDATA() throws org.xml.sax.SAXException {
-    m_inCData = false;
-  }
-
-  /**
-   * Receive notification of cdata.
-   * 
-   * <p>
-   * The Parser will call this method to report each chunk of character data.
-   * SAX parsers may return all contiguous character data in a single chunk, or
-   * they may split it into several chunks; however, all of the characters in
-   * any single event must come from the same external entity, so that the
-   * Locator provides useful information.
-   * </p>
-   * 
-   * <p>
-   * The application must not attempt to read from the array outside of the
-   * specified range.
-   * </p>
-   * 
-   * <p>
-   * Note that some parsers will report whitespace using the
-   * ignorableWhitespace() method rather than this one (validating parsers must
-   * do so).
-   * </p>
-   * 
-   * @param ch
-   *          The characters from the XML document.
-   * @param start
-   *          The start position in the array.
-   * @param length
-   *          The number of characters to read from the array.
-   * @see #ignorableWhitespace
-   * @see org.xml.sax.Locator
-   */
-  public void cdata(char ch[], int start, int length)
-      throws org.xml.sax.SAXException {
-    if (isOutsideDocElem()
-        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return; // avoid DOM006 Hierarchy request error
-
-    String s = new String(ch, start, length);
-
-    // XXX [email protected]: modified from the original, to accomodate TagSoup.
-    Node n = m_currentNode.getLastChild();
-    if (n instanceof CDATASection)
-      ((CDATASection) n).appendData(s);
-    else if (n instanceof Comment)
-      ((Comment) n).appendData(s);
-  }
-
-  /**
-   * Report the start of DTD declarations, if any.
-   * 
-   * Any declarations are assumed to be in the internal subset unless otherwise
-   * indicated.
-   * 
-   * @param name
-   *          The document type name.
-   * @param publicId
-   *          The declared public identifier for the external DTD subset, or
-   *          null if none was declared.
-   * @param systemId
-   *          The declared system identifier for the external DTD subset, or
-   *          null if none was declared.
-   * @see #endDTD
-   * @see #startEntity
-   */
-  public void startDTD(String name, String publicId, String systemId)
-      throws org.xml.sax.SAXException {
-
-    // Do nothing for now.
-  }
-
-  /**
-   * Report the end of DTD declarations.
-   * 
-   * @see #startDTD
-   */
-  public void endDTD() throws org.xml.sax.SAXException {
-
-    // Do nothing for now.
-  }
-
-  /**
-   * Begin the scope of a prefix-URI Namespace mapping.
-   * 
-   * <p>
-   * The information from this event is not necessary for normal Namespace
-   * processing: the SAX XML reader will automatically replace prefixes for
-   * element and attribute names when the http://xml.org/sax/features/namespaces
-   * feature is true (the default).
-   * </p>
-   * 
-   * <p>
-   * There are cases, however, when applications need to use prefixes in
-   * character data or in attribute values, where they cannot safely be expanded
-   * automatically; the start/endPrefixMapping event supplies the information to
-   * the application to expand prefixes in those contexts itself, if necessary.
-   * </p>
-   * 
-   * <p>
-   * Note that start/endPrefixMapping events are not guaranteed to be properly
-   * nested relative to each-other: all startPrefixMapping events will occur
-   * before the corresponding startElement event, and all endPrefixMapping
-   * events will occur after the corresponding endElement event, but their order
-   * is not guaranteed.
-   * </p>
-   * 
-   * @param prefix
-   *          The Namespace prefix being declared.
-   * @param uri
-   *          The Namespace URI the prefix is mapped to.
-   * @see #endPrefixMapping
-   * @see #startElement
-   */
-  public void startPrefixMapping(String prefix, String uri)
-      throws org.xml.sax.SAXException {
-
-    /*
-     * // Not sure if this is needed or wanted // Also, it fails in the stree.
-     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
-     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
-     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
-     * = "xmlns:"+prefix;
-     * 
-     * Element elem = (Element)m_currentNode; String val =
-     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
-     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
-     * uri); } }
-     */
-  }
-
-  /**
-   * End the scope of a prefix-URI mapping.
-   * 
-   * <p>
-   * See startPrefixMapping for details. This event will always occur after the
-   * corresponding endElement event, but the order of endPrefixMapping events is
-   * not otherwise guaranteed.
-   * </p>
-   * 
-   * @param prefix
-   *          The prefix that was being mapping.
-   * @see #startPrefixMapping
-   * @see #endElement
-   */
-  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
-  }
-
-  /**
-   * Receive notification of a skipped entity.
-   * 
-   * <p>
-   * The Parser will invoke this method once for each entity skipped.
-   * Non-validating processors may skip entities if they have not seen the
-   * declarations (because, for example, the entity was declared in an external
-   * DTD subset). All processors may skip external entities, depending on the
-   * values of the http://xml.org/sax/features/external-general-entities and the
-   * http://xml.org/sax/features/external-parameter-entities properties.
-   * </p>
-   * 
-   * @param name
-   *          The name of the skipped entity. If it is a parameter entity, the
-   *          name will begin with '%'.
-   */
-  public void skippedEntity(String name) throws org.xml.sax.SAXException {
-  }
-
-  public boolean isUpperCaseElementNames() {
-    return upperCaseElementNames;
-  }
-
-  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
-    this.upperCaseElementNames = upperCaseElementNames;
-  }
- 
-  public String getDefaultNamespaceURI() {
-    return defaultNamespaceURI;
-  }
-
-  public void setDefaultNamespaceURI(String defaultNamespaceURI) {
-    this.defaultNamespaceURI = defaultNamespaceURI;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
deleted file mode 100644
index 5c4c990..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ /dev/null
@@ -1,402 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.tika;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.util.NodeWalker;
-import org.apache.nutch.util.URLUtil;
-import org.apache.tika.sax.Link;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-/**
- * A collection of methods for extracting content from DOM trees.
- * 
- * This class holds a few utility methods for pulling content out of DOM nodes,
- * such as getOutlinks, getText, etc.
- * 
- */
-public class DOMContentUtils {
-
-  private static class LinkParams {
-    private String elName;
-    private String attrName;
-    private int childLen;
-
-    private LinkParams(String elName, String attrName, int childLen) {
-      this.elName = elName;
-      this.attrName = attrName;
-      this.childLen = childLen;
-    }
-
-    public String toString() {
-      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
-    }
-  }
-
-  private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
-  private HashSet<String> ignoredTags = new HashSet<String>();
-  private Configuration conf;
-
-  public DOMContentUtils(Configuration conf) {
-    setConf(conf);
-  }
-
-  public void setConf(Configuration conf) {
-    // forceTags is used to override configurable tag ignoring, later on
-    Collection<String> forceTags = new ArrayList<String>(1);
-
-    this.conf = conf;
-    linkParams.clear();
-    linkParams.put("a", new LinkParams("a", "href", 1));
-    linkParams.put("area", new LinkParams("area", "href", 0));
-    if (conf.getBoolean("parser.html.form.use_action", true)) {
-      linkParams.put("form", new LinkParams("form", "action", 1));
-      if (conf.get("parser.html.form.use_action") != null)
-        forceTags.add("form");
-    }
-    linkParams.put("frame", new LinkParams("frame", "src", 0));
-    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
-    linkParams.put("script", new LinkParams("script", "src", 0));
-    linkParams.put("link", new LinkParams("link", "href", 0));
-    linkParams.put("img", new LinkParams("img", "src", 0));
-
-    // remove unwanted link tags from the linkParams map
-    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
-    for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
-      ignoredTags.add(ignoreTags[i].toLowerCase());
-      if (!forceTags.contains(ignoreTags[i]))
-        linkParams.remove(ignoreTags[i]);
-    }
-  }
-
-  /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
-   * append all the content text found beneath the DOM node to the
-   * <code>StringBuffer</code>.
-   * 
-   * <p>
-   * 
-   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
-   * and the <code>StringBuffer</code> will not contain any text encountered
-   * after a nested anchor is found.
-   * 
-   * <p>
-   * 
-   * @return true if nested anchors were found
-   */
-  private boolean getText(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors) {
-    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
-      return true;
-    }
-    return false;
-  }
-
-  /**
-   * This is a convinience method, equivalent to
-   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
-   * 
-   */
-  public void getText(StringBuffer sb, Node node) {
-    getText(sb, node, false);
-  }
-
-  // returns true if abortOnNestedAnchors is true and we find nested
-  // anchors
-  private boolean getTextHelper(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors, int anchorDepth) {
-    boolean abort = false;
-    NodeWalker walker = new NodeWalker(node);
-
-    while (walker.hasNext()) {
-
-      Node currentNode = walker.nextNode();
-      String nodeName = currentNode.getNodeName();
-      short nodeType = currentNode.getNodeType();
-
-      if ("script".equalsIgnoreCase(nodeName)) {
-        walker.skipChildren();
-      }
-      if ("style".equalsIgnoreCase(nodeName)) {
-        walker.skipChildren();
-      }
-      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
-        anchorDepth++;
-        if (anchorDepth > 1) {
-          abort = true;
-          break;
-        }
-      }
-      if (nodeType == Node.COMMENT_NODE) {
-        walker.skipChildren();
-      }
-      if (nodeType == Node.TEXT_NODE) {
-        // cleanup and trim the value
-        String text = currentNode.getNodeValue();
-        text = text.replaceAll("\\s+", " ");
-        text = text.trim();
-        if (text.length() > 0) {
-          if (sb.length() > 0)
-            sb.append(' ');
-          sb.append(text);
-        }
-      }
-    }
-
-    return abort;
-  }
-
-  /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
-   * append the content text found beneath the first <code>title</code> node to
-   * the <code>StringBuffer</code>.
-   * 
-   * @return true if a title node was found, false otherwise
-   */
-  public boolean getTitle(StringBuffer sb, Node node) {
-
-    NodeWalker walker = new NodeWalker(node);
-
-    while (walker.hasNext()) {
-
-      Node currentNode = walker.nextNode();
-      String nodeName = currentNode.getNodeName();
-      short nodeType = currentNode.getNodeType();
-
-      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
-        return false;
-      }
-
-      if (nodeType == Node.ELEMENT_NODE) {
-        if ("title".equalsIgnoreCase(nodeName)) {
-          getText(sb, currentNode);
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  /** If Node contains a BASE tag then it's HREF is returned. */
-  URL getBase(Node node) {
-
-    NodeWalker walker = new NodeWalker(node);
-
-    while (walker.hasNext()) {
-
-      Node currentNode = walker.nextNode();
-      String nodeName = currentNode.getNodeName();
-      short nodeType = currentNode.getNodeType();
-
-      // is this node a BASE tag?
-      if (nodeType == Node.ELEMENT_NODE) {
-
-        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
-          return null;
-        }
-
-        if ("base".equalsIgnoreCase(nodeName)) {
-          NamedNodeMap attrs = currentNode.getAttributes();
-          for (int i = 0; i < attrs.getLength(); i++) {
-            Node attr = attrs.item(i);
-            if ("href".equalsIgnoreCase(attr.getNodeName())) {
-              try {
-                return new URL(attr.getNodeValue());
-              } catch (MalformedURLException e) {
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // no.
-    return null;
-  }
-
-  private boolean hasOnlyWhiteSpace(Node node) {
-    String val = node.getNodeValue();
-    for (int i = 0; i < val.length(); i++) {
-      if (!Character.isWhitespace(val.charAt(i)))
-        return false;
-    }
-    return true;
-  }
-  
-  // this only covers a few cases of empty links that are symptomatic
-  // of nekohtml's DOM-fixup process...
-  private boolean shouldThrowAwayLink(Node node, NodeList children,
-      int childLen, LinkParams params) {
-    if (childLen == 0) {
-      // this has no inner structure
-      if (params.childLen == 0)
-        return false;
-      else
-        return true;
-    } else if ((childLen == 1)
-        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
-        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
-      // single nested link
-      return true;
-
-    } else if (childLen == 2) {
-
-      Node c0 = children.item(0);
-      Node c1 = children.item(1);
-
-      if ((c0.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
-          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
-        // single link followed by whitespace node
-        return true;
-      }
-
-      if ((c1.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
-        // whitespace node followed by single link
-        return true;
-      }
-
-    } else if (childLen == 3) {
-      Node c0 = children.item(0);
-      Node c1 = children.item(1);
-      Node c2 = children.item(2);
-
-      if ((c1.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE)
-          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
-          && hasOnlyWhiteSpace(c2)) {
-        // single link surrounded by whitespace nodes
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-  /**
-   * This method finds all anchors below the supplied DOM <code>node</code>, and
-   * creates appropriate {@link Outlink} records for each (relative to the
-   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
-   * {@link ArrayList}.
-   * 
-   * <p>
-   * 
-   * Links without inner structure (tags, text, etc) are discarded, as are links
-   * which contain only single nested links and empty text nodes (this is a
-   * common DOM-fixup artifact, at least with nekohtml).
-   */
-  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
-
-    NodeWalker walker = new NodeWalker(node);
-    while (walker.hasNext()) {
-
-      Node currentNode = walker.nextNode();
-      String nodeName = currentNode.getNodeName();
-      short nodeType = currentNode.getNodeType();
-      NodeList children = currentNode.getChildNodes();
-      int childLen = (children != null) ? children.getLength() : 0;
-
-      if (nodeType == Node.ELEMENT_NODE) {
-
-        nodeName = nodeName.toLowerCase();
-        LinkParams params = (LinkParams) linkParams.get(nodeName);
-        if (params != null) {
-          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
-
-            StringBuffer linkText = new StringBuffer();
-            getText(linkText, currentNode, true);
-
-            NamedNodeMap attrs = currentNode.getAttributes();
-            String target = null;
-            boolean noFollow = false;
-            boolean post = false;
-            for (int i = 0; i < attrs.getLength(); i++) {
-              Node attr = attrs.item(i);
-              String attrName = attr.getNodeName();
-              if (params.attrName.equalsIgnoreCase(attrName)) {
-                target = attr.getNodeValue();
-              } else if ("rel".equalsIgnoreCase(attrName)
-                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
-                noFollow = true;
-              } else if ("method".equalsIgnoreCase(attrName)
-                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
-                post = true;
-              }
-            }
-            if (target != null && !noFollow && !post)
-              try {
-
-                URL url = URLUtil.resolveURL(base, target);
-                outlinks.add(new Outlink(url.toString(), linkText.toString()
-                    .trim()));
-              } catch (MalformedURLException e) {
-                // don't care
-              }
-          }
-          // this should not have any children, skip them
-          if (params.childLen == 0)
-            continue;
-        }
-      }
-    }
-  }
-  
-  // This one is used by NUTCH-1918
-  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
-    String target = null;
-    String anchor = null;
-    boolean noFollow = false;
-
-    for (Link link : tikaExtractedOutlinks) {
-      target = link.getUri();
-      noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false;
-      anchor = link.getText();
-
-      if (!ignoredTags.contains(link.getType())) {
-        if (target != null && !noFollow) {
-          try {
-            URL url = URLUtil.resolveURL(base, target);
-            
-            // clean the anchor
-            anchor = anchor.replaceAll("\\s+", " ");
-            anchor = anchor.trim();
-            
-            outlinks.add(new Outlink(url.toString(), anchor));
-          } catch (MalformedURLException e) {
-            // don't care
-          }
-        }
-      }
-    }
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
deleted file mode 100644
index 294bde9..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.tika;
-
-import java.net.URL;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.w3c.dom.*;
-
-/**
- * Class for parsing META Directives from DOM trees. This class handles
- * specifically Robots META directives (all, none, nofollow, noindex), finding
- * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
- * stored in a HTMLMetaTags instance.
- */
-public class HTMLMetaProcessor {
-
-  /**
-   * Utility class with indicators for the robots directives "noindex" and
-   * "nofollow", and HTTP-EQUIV/no-cache
-   */
-
-  /**
-   * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
-   * on any META tags found under the given <code>node</code>.
-   */
-  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
-      URL currURL) {
-
-    metaTags.reset();
-    getMetaTagsHelper(metaTags, node, currURL);
-  }
-
-  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
-      URL currURL) {
-
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-
-      if ("body".equalsIgnoreCase(node.getNodeName())) {
-        // META tags should not be under body
-        return;
-      }
-
-      if ("meta".equalsIgnoreCase(node.getNodeName())) {
-        NamedNodeMap attrs = node.getAttributes();
-        Node nameNode = null;
-        Node equivNode = null;
-        Node contentNode = null;
-        // Retrieves name, http-equiv and content attribues
-        for (int i = 0; i < attrs.getLength(); i++) {
-          Node attr = attrs.item(i);
-          String attrName = attr.getNodeName().toLowerCase();
-          if (attrName.equals("name")) {
-            nameNode = attr;
-          } else if (attrName.equals("http-equiv")) {
-            equivNode = attr;
-          } else if (attrName.equals("content")) {
-            contentNode = attr;
-          }
-        }
-
-        if (nameNode != null) {
-          if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();
-            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
-            if ("robots".equals(name)) {
-
-              if (contentNode != null) {
-                String directives = contentNode.getNodeValue().toLowerCase();
-                int index = directives.indexOf("none");
-
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                  metaTags.setNoFollow();
-                }
-
-                index = directives.indexOf("all");
-                if (index >= 0) {
-                  // do nothing...
-                }
-
-                index = directives.indexOf("noindex");
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                }
-
-                index = directives.indexOf("nofollow");
-                if (index >= 0) {
-                  metaTags.setNoFollow();
-                }
-
-                index = directives.indexOf("noarchive");
-                if (index >= 0) {
-                  metaTags.setNoCache();
-                }
-              }
-
-            } // end if (name == robots)
-          }
-        }
-
-        if (equivNode != null) {
-          if (contentNode != null) {
-            String name = equivNode.getNodeValue().toLowerCase();
-            String content = contentNode.getNodeValue();
-            metaTags.getHttpEquivTags().setProperty(name, content);
-            if ("pragma".equals(name)) {
-              content = content.toLowerCase();
-              int index = content.indexOf("no-cache");
-              if (index >= 0)
-                metaTags.setNoCache();
-            } else if ("refresh".equals(name)) {
-              int idx = content.indexOf(';');
-              String time = null;
-              if (idx == -1) { // just the refresh time
-                time = content;
-              } else
-                time = content.substring(0, idx);
-              try {
-                metaTags.setRefreshTime(Integer.parseInt(time));
-                // skip this if we couldn't parse the time
-                metaTags.setRefresh(true);
-              } catch (Exception e) {
-                ;
-              }
-              URL refreshUrl = null;
-              if (metaTags.getRefresh() && idx != -1) { // set the URL
-                idx = content.toLowerCase().indexOf("url=");
-                if (idx == -1) { // assume a mis-formatted entry with just the
-                                 // url
-                  idx = content.indexOf(';') + 1;
-                } else
-                  idx += 4;
-                if (idx != -1) {
-                  String url = content.substring(idx);
-                  try {
-                    refreshUrl = new URL(url);
-                  } catch (Exception e) {
-                    // XXX according to the spec, this has to be an absolute
-                    // XXX url. However, many websites use relative URLs and
-                    // XXX expect browsers to handle that.
-                    // XXX Unfortunately, in some cases this may create a
-                    // XXX infinitely recursive paths (a crawler trap)...
-                    // if (!url.startsWith("/")) url = "/" + url;
-                    try {
-                      refreshUrl = new URL(currURL, url);
-                    } catch (Exception e1) {
-                      refreshUrl = null;
-                    }
-                  }
-                }
-              }
-              if (metaTags.getRefresh()) {
-                if (refreshUrl == null) {
-                  // apparently only refresh time was present. set the URL
-                  // to the same URL.
-                  refreshUrl = currURL;
-                }
-                metaTags.setRefreshHref(refreshUrl);
-              }
-            }
-          }
-        }
-
-      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
-        NamedNodeMap attrs = node.getAttributes();
-        Node hrefNode = attrs.getNamedItem("href");
-
-        if (hrefNode != null) {
-          String urlString = hrefNode.getNodeValue();
-
-          URL url = null;
-          try {
-            if (currURL == null)
-              url = new URL(urlString);
-            else
-              url = new URL(currURL, urlString);
-          } catch (Exception e) {
-            ;
-          }
-
-          if (url != null)
-            metaTags.setBaseHref(url);
-        }
-
-      }
-
-    }
-
-    NodeList children = node.getChildNodes();
-    if (children != null) {
-      int len = children.getLength();
-      for (int i = 0; i < len; i++) {
-        getMetaTagsHelper(metaTags, children.item(i), currURL);
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
deleted file mode 100644
index 5d7eca9..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.tika;
-
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.html.dom.HTMLDocumentImpl;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilters;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.protocol.Content;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlMapper;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.Link;
-import org.apache.tika.sax.LinkContentHandler;
-import org.apache.tika.sax.TeeContentHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-import org.xml.sax.ContentHandler;
-
-/**
- * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
- * representation returned by Tika as SAX events
- ***/
-
-public class TikaParser implements org.apache.nutch.parse.Parser {
-
-  public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
-
-  private Configuration conf;
-  private TikaConfig tikaConfig = null;
-  private DOMContentUtils utils;
-  private HtmlParseFilters htmlParseFilters;
-  private String cachingPolicy;
-  private HtmlMapper HTMLMapper;
-  private boolean upperCaseElementNames = true;
-
-  @SuppressWarnings("deprecation")
-  public ParseResult getParse(Content content) {
-    String mimeType = content.getContentType();
-    
-    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
-    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
-
-    URL base;
-    try {
-      base = new URL(content.getBaseUrl());
-    } catch (MalformedURLException e) {
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    // get the right parser using the mime type as a clue
-    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
-    byte[] raw = content.getContent();
-
-    if (parser == null) {
-      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
-      LOG.error(message);
-      return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
-          content.getUrl(), getConf());
-    }
-
-    LOG.debug("Using Tika parser " + parser.getClass().getName()
-        + " for mime-type " + mimeType);
-
-    Metadata tikamd = new Metadata();
-
-    HTMLDocumentImpl doc = new HTMLDocumentImpl();
-    doc.setErrorChecking(false);
-    DocumentFragment root = doc.createDocumentFragment();
-
-    ContentHandler domHandler;
-    
-    // Check whether to use Tika's BoilerplateContentHandler
-    if (useBoilerpipe) {
-      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
-      BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
-      bpHandler.setIncludeMarkup(true);
-      domHandler = (ContentHandler)bpHandler;
-    } else {
-      DOMBuilder domBuilder = new DOMBuilder(doc, root);
-      domBuilder.setUpperCaseElementNames(upperCaseElementNames);
-      domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
-      domHandler = (ContentHandler)domBuilder;
-    }
-
-    LinkContentHandler linkContentHandler = new LinkContentHandler();
-
-    ParseContext context = new ParseContext();
-    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
-    
-    if (HTMLMapper != null)
-      context.set(HtmlMapper.class, HTMLMapper);
-    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
-    try {
-      parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context);
-    } catch (Exception e) {
-      LOG.error("Error parsing " + content.getUrl(), e);
-      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    HTMLMetaTags metaTags = new HTMLMetaTags();
-    String text = "";
-    String title = "";
-    Outlink[] outlinks = new Outlink[0];
-    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
-
-    // we have converted the sax events generated by Tika into a DOM object
-    // so we can now use the usual HTML resources from Nutch
-    // get meta directives
-    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
-    }
-
-    // check meta directives
-    if (!metaTags.getNoIndex()) { // okay to index
-      StringBuffer sb = new StringBuffer();
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Getting text...");
-      }
-      utils.getText(sb, root); // extract text
-      text = sb.toString();
-      sb.setLength(0);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Getting title...");
-      }
-      utils.getTitle(sb, root); // extract title
-      title = sb.toString().trim();
-    }
-
-    if (!metaTags.getNoFollow()) { // okay to follow links
-      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
-      URL baseTag = utils.getBase(root);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Getting links...");
-      }
-      
-      // pre-1233 outlink extraction
-      //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
-      // Get outlinks from Tika
-      List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
-      utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks);
-      outlinks = l.toArray(new Outlink[l.size()]);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("found " + outlinks.length + " outlinks in "
-            + content.getUrl());
-      }
-    }
-
-    // populate Nutch metadata with Tika metadata
-    String[] TikaMDNames = tikamd.names();
-    for (String tikaMDName : TikaMDNames) {
-      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
-        continue;
-      String[] values = tikamd.getValues(tikaMDName);
-      for (String v : values)
-        nutchMetadata.add(tikaMDName, v);
-    }
-
-    // no outlinks? try OutlinkExtractor e.g works for mime types where no
-    // explicit markup for anchors
-
-    if (outlinks.length == 0) {
-      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
-    }
-
-    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
-    if (metaTags.getRefresh()) {
-      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
-      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
-          Integer.toString(metaTags.getRefreshTime()) });
-    }
-    ParseData parseData = new ParseData(status, title, outlinks,
-        content.getMetadata(), nutchMetadata);
-    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
-        new ParseImpl(text, parseData));
-
-    // run filters on parse
-    ParseResult filteredParse = this.htmlParseFilters.filter(content,
-        parseResult, metaTags, root);
-    if (metaTags.getNoCache()) { // not okay to cache
-      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
-        entry.getValue().getData().getParseMeta()
-            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
-    }
-    return filteredParse;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.tikaConfig = null;
-
-    // do we want a custom Tika configuration file
-    // deprecated since Tika 0.7 which is based on
-    // a service provider based configuration
-    String customConfFile = conf.get("tika.config.file");
-    if (customConfFile != null) {
-      try {
-        // see if a Tika config file can be found in the job file
-        URL customTikaConfig = conf.getResource(customConfFile);
-        if (customTikaConfig != null)
-          tikaConfig = new TikaConfig(customTikaConfig);
-      } catch (Exception e1) {
-        String message = "Problem loading custom Tika configuration from "
-            + customConfFile;
-        LOG.error(message, e1);
-      }
-    } else {
-      try {
-        tikaConfig = new TikaConfig(this.getClass().getClassLoader());
-      } catch (Exception e2) {
-        String message = "Problem loading default Tika configuration";
-        LOG.error(message, e2);
-      }
-    }
-
-    // use a custom htmlmapper
-    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
-    if (StringUtils.isNotBlank(htmlmapperClassName)) {
-      try {
-        Class HTMLMapperClass = Class.forName(htmlmapperClassName);
-        boolean interfaceOK = HtmlMapper.class
-            .isAssignableFrom(HTMLMapperClass);
-        if (!interfaceOK) {
-          throw new RuntimeException("Class " + htmlmapperClassName
-              + " does not implement HtmlMapper");
-        }
-        HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
-      } catch (Exception e) {
-        LOG.error("Can't generate instance for class " + htmlmapperClassName);
-        throw new RuntimeException("Can't generate instance for class "
-            + htmlmapperClassName);
-      }
-    }
-
-    this.htmlParseFilters = new HtmlParseFilters(getConf());
-    this.utils = new DOMContentUtils(conf);
-    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
-        Nutch.CACHING_FORBIDDEN_CONTENT);
-    this.upperCaseElementNames = getConf().getBoolean(
-        "tika.uppercase.element.names", true);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
deleted file mode 100644
index d625c33..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
- * XXX in order to avoid dependency on Xalan.
- */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $
- */
-package org.apache.nutch.parse.tika;
-
-/**
- * Class used to verify whether the specified <var>ch</var> conforms to the XML
- * 1.0 definition of whitespace.
- */
-class XMLCharacterRecognizer {
-
-  /**
-   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
-   * definition of whitespace. Refer to <A
-   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
-   * <CODE>S</CODE></A> for details.
-   * 
-   * @param ch
-   *          Character to check as XML whitespace.
-   * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
-   */
-  static boolean isWhiteSpace(char ch) {
-    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
-  }
-
-  /**
-   * Tell if the string is whitespace.
-   * 
-   * @param ch
-   *          Character array to check as XML whitespace.
-   * @param start
-   *          Start index of characters in the array
-   * @param length
-   *          Number of characters in the array
-   * @return True if the characters in the array are XML whitespace; otherwise,
-   *         false.
-   */
-  static boolean isWhiteSpace(char ch[], int start, int length) {
-
-    int end = start + length;
-
-    for (int s = start; s < end; s++) {
-      if (!isWhiteSpace(ch[s]))
-        return false;
-    }
-
-    return true;
-  }
-
-  /**
-   * Tell if the string is whitespace.
-   * 
-   * @param buf
-   *          StringBuffer to check as XML whitespace.
-   * @return True if characters in buffer are XML whitespace, false otherwise
-   */
-  static boolean isWhiteSpace(StringBuffer buf) {
-
-    int n = buf.length();
-
-    for (int i = 0; i < n; i++) {
-      if (!isWhiteSpace(buf.charAt(i)))
-        return false;
-    }
-
-    return true;
-  }
-
-  /**
-   * Tell if the string is whitespace.
-   * 
-   * @param s
-   *          String to check as XML whitespace.
-   * @return True if characters in buffer are XML whitespace, false otherwise
-   */
-  static boolean isWhiteSpace(String s) {
-
-    if (null != s) {
-      int n = s.length();
-
-      for (int i = 0; i < n; i++) {
-        if (!isWhiteSpace(s.charAt(i)))
-          return false;
-      }
-    }
-
-    return true;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
deleted file mode 100644
index 19e3f47..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse various document formats with help of
- * <a href="http://tika.apache.org/">Apache Tika</a>.
- */
-package org.apache.nutch.parse.tika;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
deleted file mode 100644
index 96029a6..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ /dev/null
@@ -1,337 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.tika.DOMContentUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils {
-
-  private static final String[] testPages = {
-
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"http://www.nutch.org\">"
-          + " anchor </a><!--comment-->" + "</body></html>"),
-
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
-          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
-          + "</body></html>"),
-
-      new String("<html><head><title> </title>" + "</head><body> "
-          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
-          + "</a></a>" + "</body></html>"),
-
-      // this one relies on certain neko fixup behavior, possibly
-      // distributing the anchors into the LI's-but not the other
-      // anchors (outside of them, instead)! So you get a tree that
-      // looks like:
-      // ... <li> <a href=/> home </a> </li>
-      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-      new String("<html><head><title> my title </title>"
-          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
-          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
-          + "</body></html>"),
-
-      // test frameset link extraction. The invalid frame in the middle
-      // will be
-      // fixed to a third standalone frame.
-      new String("<html><head><title> my title </title>"
-          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
-          + "</frame>" + "<frameset cols=\"20,*\">"
-          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
-          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
-          + "</frameset>" + "</frameset>" + "</body></html>"),
-
-      // test <area> and <iframe> link extraction + url normalization
-      new String(
-          "<html><head><title> my title </title>"
-              + "</head><body>"
-              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-              + "<map name=\"green\">"
-              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
-              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
-              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
-
-      // test whitespace processing for plain text extraction
-      new String(
-          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
-              + " </head>\n"
-              + " <body>\n"
-              + "    <h1> Whitespace\ttest  </h1> \n"
-              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
-              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
-              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-              + "<table>"
-              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
-              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-              + "</table>put some text here<Br>and there."
-              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-              + "         .        .        .         ." + "</body>  </html>"),
-
-      // test that <a rel=nofollow> links are not returned
-      new String("<html><head></head><body>"
-          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-          + "</body></html>"),
-      // test that POST form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      // test that all form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
-          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
-          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
-          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
-
-  private static int SKIP = 9;
-
-  private static String[] testBaseHrefs = { "http://www.nutch.org",
-      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
-      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
-      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-      "http://www.nutch.org//", "http://www.nutch.org/",
-      "http://www.nutch.org/", "http://www.nutch.org/",
-      "http://www.nutch.org/;something" };
-
-  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
-  private static final String[] answerText = {
-      "title body anchor",
-      "title body home bots",
-      "separate this from this",
-      "my title body home 1 2",
-      "my title",
-      "my title the bottom",
-      "my title Whitespace test whitespace test "
-          + "This is a whitespace test . Newlines should appear as space too. "
-          + "Tabs are spaces too. This is a break -> and the line after break . "
-          + "one two three space here space there no space "
-          + "one two two three three four put some text here and there. "
-          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
-      "test1 test2", "title anchor1 anchor2 anchor3",
-      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
-
-  private static final String[] answerTitle = { "title", "title", "",
-      "my title", "my title", "my title", "my title", "", "", "", "title",
-      "title" };
-
-  // note: should be in page-order
-  private static Outlink[][] answerOutlinks;
-
-  private static Configuration conf;
-  private static DOMContentUtils utils = null;
-
-  @Before
-  public void setup() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.setBoolean("parser.html.form.use_action", true);
-    utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser = new DOMFragmentParser();
-    parser.setFeature(
-        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
-        true);
-    for (int i = 0; i < testPages.length; i++) {
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-      try {
-        parser.parse(
-            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
-            node);
-        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
-      } catch (Exception e) {
-        Assert.assertTrue("caught exception: " + e, false);
-      }
-      testDOMs[i] = node;
-    }
-    answerOutlinks = new Outlink[][] {
-        { new Outlink("http://www.nutch.org", "anchor"), },
-        { new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
-        { new Outlink("http://www.nutch.org/", "separate this"),
-            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
-        { new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/1", "1"),
-            new Outlink("http://www.nutch.org/docs/2", "2"), },
-        { new Outlink("http://www.nutch.org/frames/top.html", ""),
-            new Outlink("http://www.nutch.org/frames/left.html", ""),
-            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-            new Outlink("http://www.nutch.org/frames/right.html", ""), },
-        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-            new Outlink("http://www.nutch.org/index.html", ""),
-            new Outlink("http://www.nutch.org/maps/#bottom", ""),
-            new Outlink("http://www.nutch.org/bot.html", ""),
-            new Outlink("http://www.nutch.org/docs/index.html", ""), },
-        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-        {},
-        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
-        {},
-        { new Outlink("http://www.nutch.org/;x", "anchor1"),
-            new Outlink("http://www.nutch.org/g;x", "anchor2"),
-            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
-        {
-            // this is tricky - see RFC3986 section 5.4.1 example 7
-            new Outlink("http://www.nutch.org/g", "anchor1"),
-            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
-            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
-                "anchor5") } };
-
-  }
-
-  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1 = new StringTokenizer(s1);
-    StringTokenizer st2 = new StringTokenizer(s2);
-
-    while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens())
-        return false;
-      if (!st1.nextToken().equals(st2.nextToken()))
-        return false;
-    }
-    if (st2.hasMoreTokens())
-      return false;
-    return true;
-  }
-
-  @Test
-  public void testGetText() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getText(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerText[i], text));
-    }
-  }
-
-  @Test
-  public void testGetTitle() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getTitle(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerTitle[i], text));
-    }
-  }
-
-  @Test
-  public void testGetOutlinks() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-      if (i == SKIP) {
-        conf.setBoolean("parser.html.form.use_action", false);
-        utils.setConf(conf);
-      } else {
-        conf.setBoolean("parser.html.form.use_action", true);
-        utils.setConf(conf);
-      }
-      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr = new Outlink[outlinks.size()];
-      outlinkArr = outlinks.toArray(outlinkArr);
-      compareOutlinks(answerOutlinks[i], outlinkArr);
-    }
-  }
-
-  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i = 0; i < o.length; i++) {
-      sb.append(o[i].toString());
-      sb.append(System.getProperty("line.separator"));
-    }
-  }
-
-  private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb = new StringBuffer();
-    appendOutlinks(sb, o);
-    return sb.toString();
-  }
-
-  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
-    if (o1.length != o2.length) {
-      Assert.assertTrue(
-          "got wrong number of outlinks (expecting " + o1.length + ", got "
-              + o2.length + ")" + System.getProperty("line.separator")
-              + "answer: " + System.getProperty("line.separator")
-              + outlinksString(o1) + System.getProperty("line.separator")
-              + "got: " + System.getProperty("line.separator")
-              + outlinksString(o2) + System.getProperty("line.separator"),
-          false);
-    }
-
-    for (int i = 0; i < o1.length; i++) {
-      if (!o1[i].equals(o2[i])) {
-        Assert.assertTrue(
-            "got wrong outlinks at position " + i
-                + System.getProperty("line.separator") + "answer: "
-                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
-                + "', anchor: '" + o1[i].getAnchor() + "'"
-                + System.getProperty("line.separator") + "got: "
-                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
-                + "', anchor: '" + o2[i].getAnchor() + "'", false);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
deleted file mode 100644
index c9394dc..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * 
- * @author mattmann / jnioche
- * 
- *         Test Suite for the RSS feeds with the {@link TikaParser}.
- * 
- */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  private String[] sampleFiles = { "rsstest.rss" };
-
-  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
-      .getName());
-
-  /**
-   * <p>
-   * The test method: tests out the following 2 asserts:
-   * </p>
-   * 
-   * <ul>
-   * <li>There are 3 outlinks read from the sample rss file</li>
-   * <li>The 3 outlinks read are in fact the correct outlinks from the sample
-   * file</li>
-   * </ul>
-   */
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    Configuration conf = NutchConfiguration.create();
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      // check that there are 2 outlinks:
-      // unlike the original parse-rss
-      // tika ignores the URL and description of the channel
-
-      // http://test.channel.com
-      // http://www-scf.usc.edu/~mattmann/
-      // http://www.nutch.org
-
-      ParseData theParseData = parse.getData();
-
-      Outlink[] theOutlinks = theParseData.getOutlinks();
-
-      Assert.assertTrue("There aren't 2 outlinks read!",
-          theOutlinks.length == 2);
-
-      // now check to make sure that those are the two outlinks
-      boolean hasLink1 = false, hasLink2 = false;
-
-      for (int j = 0; j < theOutlinks.length; j++) {
-        if (theOutlinks[j].getToUrl().equals(
-            "http://www-scf.usc.edu/~mattmann/")) {
-          hasLink1 = true;
-        }
-
-        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
-          hasLink2 = true;
-        }
-      }
-
-      if (!hasLink1 || !hasLink2) {
-        Assert.fail("Outlinks read from sample rss file are not correct!");
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
deleted file mode 100644
index b1762e6..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Test extraction of image metadata
- */
-public class TestImageMetadata {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  private String[] sampleFiles = { "nutch_logo_tm.gif", };
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      Assert.assertEquals("121", parse.getData().getMeta("width"));
-      Assert.assertEquals("48", parse.getData().getMeta("height"));
-    }
-  }
-
-}

Reply via email to