Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Thu Jan 29 05:38:59 2015
@@ -39,137 +39,126 @@ import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.Locator;
 import org.xml.sax.ext.LexicalHandler;
+
 /**
- * This class takes SAX events (in addition to some extra events
- * that SAX doesn't handle yet) and adds the result to a document
- * or document fragment.
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
  */
-class DOMBuilder
-        implements ContentHandler, LexicalHandler
-{
-    private boolean upperCaseElementNames = true;
+class DOMBuilder implements ContentHandler, LexicalHandler {
+  private boolean upperCaseElementNames = true;
 
-  /** Root document          */
+  /** Root document */
   public Document m_doc;
 
-  /** Current node           */
+  /** Current node */
   protected Node m_currentNode = null;
 
-  /** First node of document fragment or null if not a DocumentFragment     */
+  /** First node of document fragment or null if not a DocumentFragment */
   public DocumentFragment m_docFrag = null;
 
-  /** Vector of element nodes          */
+  /** Vector of element nodes */
   protected Stack<Element> m_elemStack = new Stack<Element>();
 
   /**
-   * DOMBuilder instance constructor... it will add the DOM nodes
-   * to the document fragment.
-   *
-   * @param doc Root document
-   * @param node Current node
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param node
+   *          Current node
    */
-  DOMBuilder(Document doc, Node node)
-  {
+  DOMBuilder(Document doc, Node node) {
     m_doc = doc;
     m_currentNode = node;
   }
 
   /**
-   * DOMBuilder instance constructor... it will add the DOM nodes
-   * to the document fragment.
-   *
-   * @param doc Root document
-   * @param docFrag Document fragment
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param docFrag
+   *          Document fragment
    */
-  DOMBuilder(Document doc, DocumentFragment docFrag)
-  {
+  DOMBuilder(Document doc, DocumentFragment docFrag) {
     m_doc = doc;
     m_docFrag = docFrag;
   }
 
   /**
-   * DOMBuilder instance constructor... it will add the DOM nodes
-   * to the document.
-   *
-   * @param doc Root document
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document.
+   * 
+   * @param doc
+   *          Root document
    */
-  DOMBuilder(Document doc)
-  {
+  DOMBuilder(Document doc) {
     m_doc = doc;
   }
 
   /**
-   * Get the root node of the DOM being created.  This
-   * is either a Document or a DocumentFragment.
-   *
+   * Get the root node of the DOM being created. This is either a Document or a
+   * DocumentFragment.
+   * 
    * @return The root document or document fragment if not null
    */
-  Node getRootNode()
-  {
+  Node getRootNode() {
     return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
   }
 
   /**
    * Get the node currently being processed.
-   *
+   * 
    * @return the current node being processed
    */
-  Node getCurrentNode()
-  {
+  Node getCurrentNode() {
     return m_currentNode;
   }
 
   /**
    * Return null since there is no Writer for this class.
-   *
+   * 
    * @return null
    */
-  java.io.Writer getWriter()
-  {
+  java.io.Writer getWriter() {
     return null;
   }
 
   /**
    * Append a node to the current container.
-   *
-   * @param newNode New node to append
+   * 
+   * @param newNode
+   *          New node to append
    */
-  protected void append(Node newNode) throws org.xml.sax.SAXException
-  {
+  protected void append(Node newNode) throws org.xml.sax.SAXException {
 
     Node currentNode = m_currentNode;
 
-    if (null != currentNode)
-    {
+    if (null != currentNode) {
       currentNode.appendChild(newNode);
 
       // System.out.println(newNode.getNodeName());
-    }
-    else if (null != m_docFrag)
-    {
+    } else if (null != m_docFrag) {
       m_docFrag.appendChild(newNode);
-    }
-    else
-    {
+    } else {
       boolean ok = true;
       short type = newNode.getNodeType();
 
-      if (type == Node.TEXT_NODE)
-      {
+      if (type == Node.TEXT_NODE) {
         String data = newNode.getNodeValue();
 
-        if ((null != data) && (data.trim().length() > 0))
-        {
-          throw new org.xml.sax.SAXException("Warning: can't output text before document element!  Ignoring...");
+        if ((null != data) && (data.trim().length() > 0)) {
+          throw new org.xml.sax.SAXException(
+              "Warning: can't output text before document element!  Ignoring...");
         }
 
         ok = false;
-      }
-      else if (type == Node.ELEMENT_NODE)
-      {
-        if (m_doc.getDocumentElement() != null)
-        {
-          throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+      } else if (type == Node.ELEMENT_NODE) {
+        if (m_doc.getDocumentElement() != null) {
+          throw new org.xml.sax.SAXException(
+              "Can't have more than one root on a DOM!");
         }
       }
 
@@ -180,135 +169,142 @@ class DOMBuilder
 
   /**
    * Receive an object for locating the origin of SAX document events.
-   *
-   * <p>SAX parsers are strongly encouraged (though not absolutely
-   * required) to supply a locator: if it does so, it must supply
-   * the locator to the application by invoking this method before
-   * invoking any of the other methods in the ContentHandler
-   * interface.</p>
-   *
-   * <p>The locator allows the application to determine the end
-   * position of any document-related event, even if the parser is
-   * not reporting an error.  Typically, the application will
-   * use this information for reporting its own errors (such as
-   * character content that does not match an application's
-   * business rules).  The information returned by the locator
-   * is probably not sufficient for use with a search engine.</p>
-   *
-   * <p>Note that the locator will return correct information only
-   * during the invocation of the events in this interface.  The
-   * application should not attempt to use it at any other time.</p>
-   *
-   * @param locator An object that can return the location of
-   *                any SAX document event.
+   * 
+   * <p>
+   * SAX parsers are strongly encouraged (though not absolutely required) to
+   * supply a locator: if it does so, it must supply the locator to the
+   * application by invoking this method before invoking any of the other
+   * methods in the ContentHandler interface.
+   * </p>
+   * 
+   * <p>
+   * The locator allows the application to determine the end position of any
+   * document-related event, even if the parser is not reporting an error.
+   * Typically, the application will use this information for reporting its own
+   * errors (such as character content that does not match an application's
+   * business rules). The information returned by the locator is probably not
+   * sufficient for use with a search engine.
+   * </p>
+   * 
+   * <p>
+   * Note that the locator will return correct information only during the
+   * invocation of the events in this interface. The application should not
+   * attempt to use it at any other time.
+   * </p>
+   * 
+   * @param locator
+   *          An object that can return the location of any SAX document event.
    * @see org.xml.sax.Locator
    */
-  public void setDocumentLocator(Locator locator)
-  {
+  public void setDocumentLocator(Locator locator) {
 
     // No action for the moment.
   }
 
   /**
    * Receive notification of the beginning of a document.
-   *
-   * <p>The SAX parser will invoke this method only once, before any
-   * other methods in this interface or in DTDHandler (except for
-   * setDocumentLocator).</p>
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, before any other methods
+   * in this interface or in DTDHandler (except for setDocumentLocator).
+   * </p>
    */
-  public void startDocument() throws org.xml.sax.SAXException
-  {
+  public void startDocument() throws org.xml.sax.SAXException {
 
     // No action for the moment.
   }
 
   /**
    * Receive notification of the end of a document.
-   *
-   * <p>The SAX parser will invoke this method only once, and it will
-   * be the last method invoked during the parse.  The parser shall
-   * not invoke this method until it has either abandoned parsing
-   * (because of an unrecoverable error) or reached the end of
-   * input.</p>
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, and it will be the last
+   * method invoked during the parse. The parser shall not invoke this method
+   * until it has either abandoned parsing (because of an unrecoverable error)
+   * or reached the end of input.
+   * </p>
    */
-  public void endDocument() throws org.xml.sax.SAXException
-  {
+  public void endDocument() throws org.xml.sax.SAXException {
 
     // No action for the moment.
   }
 
   /**
    * Receive notification of the beginning of an element.
-   *
-   * <p>The Parser will invoke this method at the beginning of every
-   * element in the XML document; there will be a corresponding
-   * endElement() event for every startElement() event (even when the
-   * element is empty). All of the element's content will be
-   * reported, in order, before the corresponding endElement()
-   * event.</p>
-   *
-   * <p>If the element name has a namespace prefix, the prefix will
-   * still be attached.  Note that the attribute list provided will
-   * contain only attributes with explicit values (specified or
-   * defaulted): #IMPLIED attributes will be omitted.</p>
-   *
-   *
-   * @param ns The namespace of the node
-   * @param localName The local part of the qualified name
-   * @param name The element name.
-   * @param atts The attributes attached to the element, if any.
+   * 
+   * <p>
+   * The Parser will invoke this method at the beginning of every element in the
+   * XML document; there will be a corresponding endElement() event for every
+   * startElement() event (even when the element is empty). All of the element's
+   * content will be reported, in order, before the corresponding endElement()
+   * event.
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached. Note that the attribute list provided will contain only
+   * attributes with explicit values (specified or defaulted): #IMPLIED
+   * attributes will be omitted.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          The namespace of the node
+   * @param localName
+   *          The local part of the qualified name
+   * @param name
+   *          The element name.
+   * @param atts
+   *          The attributes attached to the element, if any.
    * @see #endElement
    * @see org.xml.sax.Attributes
    */
-  public void startElement(
-          String ns, String localName, String name, Attributes atts)
-            throws org.xml.sax.SAXException
-  {
+  public void startElement(String ns, String localName, String name,
+      Attributes atts) throws org.xml.sax.SAXException {
 
     Element elem;
-    
+
     if (upperCaseElementNames)
-        name = name.toUpperCase();
-    
-       // Note that the namespace-aware call must be used to correctly
-       // construct a Level 2 DOM, even for non-namespaced nodes.
+      name = name.toUpperCase();
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.
     if ((null == ns) || (ns.length() == 0))
-      elem = m_doc.createElementNS(null,name);
+      elem = m_doc.createElementNS(null, name);
     else
       elem = m_doc.createElementNS(ns, name);
 
     append(elem);
 
-    try
-    {
+    try {
       int nAtts = atts.getLength();
 
-      if (0 != nAtts)
-      {
-        for (int i = 0; i < nAtts; i++)
-        {
+      if (0 != nAtts) {
+        for (int i = 0; i < nAtts; i++) {
 
-          //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) );
+          // System.out.println("type " + atts.getType(i) + " name " +
+          // atts.getLocalName(i) );
           // First handle a possible ID attribute
           if (atts.getType(i).equalsIgnoreCase("ID"))
             setIDAttribute(atts.getValue(i), elem);
 
           String attrNS = atts.getURI(i);
 
-          if("".equals(attrNS))
+          if ("".equals(attrNS))
             attrNS = null; // DOM represents no-namespace as null
 
           // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
-          //                   +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+          // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
           // Crimson won't let us set an xmlns: attribute on the DOM.
           String attrQName = atts.getQName(i);
 
-          // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace
+          // In SAX, xmlns: attributes have an empty namespace, while in DOM
+          // they should have the xmlns namespace
           if (attrQName.startsWith("xmlns:"))
             attrNS = "http://www.w3.org/2000/xmlns/";
 
           // ALWAYS use the DOM Level 2 call!
-          elem.setAttributeNS(attrNS,attrQName, atts.getValue(i));
+          elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
         }
       }
 
@@ -319,9 +315,7 @@ class DOMBuilder
       m_currentNode = elem;
 
       // append(elem);
-    }
-    catch(java.lang.Exception de)
-    {
+    } catch (java.lang.Exception de) {
       // de.printStackTrace();
       throw new org.xml.sax.SAXException(de);
     }
@@ -329,74 +323,87 @@ class DOMBuilder
   }
 
   /**
-
-
-
+   * 
+   * 
+   * 
    * Receive notification of the end of an element.
-   *
-   * <p>The SAX parser will invoke this method at the end of every
-   * element in the XML document; there will be a corresponding
-   * startElement() event for every endElement() event (even when the
-   * element is empty).</p>
-   *
-   * <p>If the element name has a namespace prefix, the prefix will
-   * still be attached to the name.</p>
-   *
-   *
-   * @param ns the namespace of the element
-   * @param localName The local part of the qualified name of the element
-   * @param name The element name
+   * 
+   * <p>
+   * The SAX parser will invoke this method at the end of every element in the
+   * XML document; there will be a corresponding startElement() event for every
+   * endElement() event (even when the element is empty).
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached to the name.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          the namespace of the element
+   * @param localName
+   *          The local part of the qualified name of the element
+   * @param name
+   *          The element name
    */
   public void endElement(String ns, String localName, String name)
-          throws org.xml.sax.SAXException
-  {
+      throws org.xml.sax.SAXException {
     m_elemStack.pop();
-    m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek();
+    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
   }
 
   /**
    * Set an ID string to node association in the ID table.
-   *
-   * @param id The ID string.
-   * @param elem The associated ID.
+   * 
+   * @param id
+   *          The ID string.
+   * @param elem
+   *          The associated ID.
    */
-  public void setIDAttribute(String id, Element elem)
-  {
+  public void setIDAttribute(String id, Element elem) {
 
     // Do nothing. This method is meant to be overiden.
   }
 
   /**
    * Receive notification of character data.
-   *
-   * <p>The Parser will call this method to report each chunk of
-   * character data.  SAX parsers may return all contiguous character
-   * data in a single chunk, or they may split it into several
-   * chunks; however, all of the characters in any single event
-   * must come from the same external entity, so that the Locator
-   * provides useful information.</p>
-   *
-   * <p>The application must not attempt to read from the array
-   * outside of the specified range.</p>
-   *
-   * <p>Note that some parsers will report whitespace using the
-   * ignorableWhitespace() method rather than this one (validating
-   * parsers must do so).</p>
-   *
-   * @param ch The characters from the XML document.
-   * @param start The start position in the array.
-   * @param length The number of characters to read from the array.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
    * @see #ignorableWhitespace
    * @see org.xml.sax.Locator
    */
-  public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
-  {
-    if(isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return;  // avoid DOM006 Hierarchy request error
+  public void characters(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
 
-    if (m_inCData)
-    {
+    if (m_inCData) {
       cdata(ch, start, length);
 
       return;
@@ -404,57 +411,55 @@ class DOMBuilder
 
     String s = new String(ch, start, length);
     Node childNode;
-    childNode =  m_currentNode != null ? m_currentNode.getLastChild(): null;
-    if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){
-       ((Text)childNode).appendData(s);
-    }
-    else{
-       Text text = m_doc.createTextNode(s);
-       append(text);
+    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+      ((Text) childNode).appendData(s);
+    } else {
+      Text text = m_doc.createTextNode(s);
+      append(text);
     }
   }
 
   /**
-   * If available, when the disable-output-escaping attribute is used,
-   * output raw text without escaping.  A PI will be inserted in front
-   * of the node with the name "lotusxsl-next-is-raw" and a value of
-   * "formatter-to-dom".
-   *
-   * @param ch Array containing the characters
-   * @param start Index to start of characters in the array
-   * @param length Number of characters in the array
+   * If available, when the disable-output-escaping attribute is used, output
+   * raw text without escaping. A PI will be inserted in front of the node with
+   * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
+   * 
+   * @param ch
+   *          Array containing the characters
+   * @param start
+   *          Index to start of characters in the array
+   * @param length
+   *          Number of characters in the array
    */
   public void charactersRaw(char ch[], int start, int length)
-          throws org.xml.sax.SAXException
-  {
-    if(isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return;  // avoid DOM006 Hierarchy request error
-
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
 
     String s = new String(ch, start, length);
 
     append(m_doc.createProcessingInstruction("xslt-next-is-raw",
-                                             "formatter-to-dom"));
+        "formatter-to-dom"));
     append(m_doc.createTextNode(s));
   }
 
   /**
    * Report the beginning of an entity.
-   *
-   * The start and end of the document entity are not reported.
-   * The start and end of the external DTD subset are reported
-   * using the pseudo-name "[dtd]".  All other events must be
-   * properly nested within start/end entity events.
-   *
-   * @param name The name of the entity.  If it is a parameter
-   *        entity, the name will begin with '%'.
+   * 
+   * The start and end of the document entity are not reported. The start and
+   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+   * All other events must be properly nested within start/end entity events.
+   * 
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
    * @see #endEntity
    * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
    * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
    */
-  public void startEntity(String name) throws org.xml.sax.SAXException
-  {
+  public void startEntity(String name) throws org.xml.sax.SAXException {
 
     // Almost certainly the wrong behavior...
     // entityReference(name);
@@ -462,49 +467,58 @@ class DOMBuilder
 
   /**
    * Report the end of an entity.
-   *
-   * @param name The name of the entity that is ending.
+   * 
+   * @param name
+   *          The name of the entity that is ending.
    * @see #startEntity
    */
-  public void endEntity(String name) throws org.xml.sax.SAXException{}
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
 
   /**
    * Receive notivication of a entityReference.
-   *
-   * @param name name of the entity reference
+   * 
+   * @param name
+   *          name of the entity reference
    */
-  public void entityReference(String name) throws org.xml.sax.SAXException
-  {
+  public void entityReference(String name) throws org.xml.sax.SAXException {
     append(m_doc.createEntityReference(name));
   }
 
   /**
    * Receive notification of ignorable whitespace in element content.
-   *
-   * <p>Validating Parsers must use this method to report each chunk
-   * of ignorable whitespace (see the W3C XML 1.0 recommendation,
-   * section 2.10): non-validating parsers may also use this method
-   * if they are capable of parsing and using content models.</p>
-   *
-   * <p>SAX parsers may return all contiguous whitespace in a single
-   * chunk, or they may split it into several chunks; however, all of
-   * the characters in any single event must come from the same
-   * external entity, so that the Locator provides useful
-   * information.</p>
-   *
-   * <p>The application must not attempt to read from the array
-   * outside of the specified range.</p>
-   *
-   * @param ch The characters from the XML document.
-   * @param start The start position in the array.
-   * @param length The number of characters to read from the array.
+   * 
+   * <p>
+   * Validating Parsers must use this method to report each chunk of ignorable
+   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+   * non-validating parsers may also use this method if they are capable of
+   * parsing and using content models.
+   * </p>
+   * 
+   * <p>
+   * SAX parsers may return all contiguous whitespace in a single chunk, or they
+   * may split it into several chunks; however, all of the characters in any
+   * single event must come from the same external entity, so that the Locator
+   * provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
    * @see #characters
    */
   public void ignorableWhitespace(char ch[], int start, int length)
-          throws org.xml.sax.SAXException
-  {
-    if(isOutsideDocElem())
-      return;  // avoid DOM006 Hierarchy request error
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem())
+      return; // avoid DOM006 Hierarchy request error
 
     String s = new String(ch, start, length);
 
@@ -513,240 +527,252 @@ class DOMBuilder
 
   /**
    * Tell if the current node is outside the document element.
-   *
+   * 
    * @return true if the current node is outside the document element.
    */
-   private boolean isOutsideDocElem()
-   {
-      return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
-   }
+  private boolean isOutsideDocElem() {
+    return (null == m_docFrag)
+        && m_elemStack.size() == 0
+        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+  }
 
   /**
    * Receive notification of a processing instruction.
-   *
-   * <p>The Parser will invoke this method once for each processing
-   * instruction found: note that processing instructions may occur
-   * before or after the main document element.</p>
-   *
-   * <p>A SAX parser should never report an XML declaration (XML 1.0,
-   * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
-   * using this method.</p>
-   *
-   * @param target The processing instruction target.
-   * @param data The processing instruction data, or null if
-   *        none was supplied.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each processing instruction
+   * found: note that processing instructions may occur before or after the main
+   * document element.
+   * </p>
+   * 
+   * <p>
+   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+   * or a text declaration (XML 1.0, section 4.3.1) using this method.
+   * </p>
+   * 
+   * @param target
+   *          The processing instruction target.
+   * @param data
+   *          The processing instruction data, or null if none was supplied.
    */
   public void processingInstruction(String target, String data)
-          throws org.xml.sax.SAXException
-  {
+      throws org.xml.sax.SAXException {
     append(m_doc.createProcessingInstruction(target, data));
   }
 
   /**
    * Report an XML comment anywhere in the document.
-   *
-   * This callback will be used for comments inside or outside the
-   * document element, including comments in the external DTD
-   * subset (if read).
-   *
-   * @param ch An array holding the characters in the comment.
-   * @param start The starting position in the array.
-   * @param length The number of characters to use from the array.
+   * 
+   * This callback will be used for comments inside or outside the document
+   * element, including comments in the external DTD subset (if read).
+   * 
+   * @param ch
+   *          An array holding the characters in the comment.
+   * @param start
+   *          The starting position in the array.
+   * @param length
+   *          The number of characters to use from the array.
    */
-  public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
-  {
+  public void comment(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
     // tagsoup sometimes submits invalid values here
-    if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return;
+    if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
+      return;
     append(m_doc.createComment(new String(ch, start, length)));
   }
 
-  /** Flag indicating that we are processing a CData section          */
+  /** Flag indicating that we are processing a CData section */
   protected boolean m_inCData = false;
 
   /**
    * Report the start of a CDATA section.
-   *
+   * 
    * @see #endCDATA
    */
-  public void startCDATA() throws org.xml.sax.SAXException
-  {
+  public void startCDATA() throws org.xml.sax.SAXException {
     m_inCData = true;
     append(m_doc.createCDATASection(""));
   }
 
   /**
    * Report the end of a CDATA section.
-   *
+   * 
    * @see #startCDATA
    */
-  public void endCDATA() throws org.xml.sax.SAXException
-  {
+  public void endCDATA() throws org.xml.sax.SAXException {
     m_inCData = false;
   }
 
   /**
    * Receive notification of cdata.
-   *
-   * <p>The Parser will call this method to report each chunk of
-   * character data.  SAX parsers may return all contiguous character
-   * data in a single chunk, or they may split it into several
-   * chunks; however, all of the characters in any single event
-   * must come from the same external entity, so that the Locator
-   * provides useful information.</p>
-   *
-   * <p>The application must not attempt to read from the array
-   * outside of the specified range.</p>
-   *
-   * <p>Note that some parsers will report whitespace using the
-   * ignorableWhitespace() method rather than this one (validating
-   * parsers must do so).</p>
-   *
-   * @param ch The characters from the XML document.
-   * @param start The start position in the array.
-   * @param length The number of characters to read from the array.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
    * @see #ignorableWhitespace
    * @see org.xml.sax.Locator
    */
-  public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
-  {
-    if(isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
-      return;  // avoid DOM006 Hierarchy request error
+  public void cdata(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
 
     String s = new String(ch, start, length);
 
-    // XXX [email protected]: modified from the original, to accomodate TagSoup. 
+    // XXX [email protected]: modified from the original, to accomodate TagSoup.
     Node n = m_currentNode.getLastChild();
     if (n instanceof CDATASection)
-      ((CDATASection)n).appendData(s);
+      ((CDATASection) n).appendData(s);
     else if (n instanceof Comment)
-      ((Comment)n).appendData(s);
+      ((Comment) n).appendData(s);
   }
 
   /**
    * Report the start of DTD declarations, if any.
-   *
-   * Any declarations are assumed to be in the internal subset
-   * unless otherwise indicated.
-   *
-   * @param name The document type name.
-   * @param publicId The declared public identifier for the
-   *        external DTD subset, or null if none was declared.
-   * @param systemId The declared system identifier for the
-   *        external DTD subset, or null if none was declared.
+   * 
+   * Any declarations are assumed to be in the internal subset unless otherwise
+   * indicated.
+   * 
+   * @param name
+   *          The document type name.
+   * @param publicId
+   *          The declared public identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @param systemId
+   *          The declared system identifier for the external DTD subset, or
+   *          null if none was declared.
    * @see #endDTD
    * @see #startEntity
    */
   public void startDTD(String name, String publicId, String systemId)
-          throws org.xml.sax.SAXException
-  {
+      throws org.xml.sax.SAXException {
 
     // Do nothing for now.
   }
 
   /**
    * Report the end of DTD declarations.
-   *
+   * 
    * @see #startDTD
    */
-  public void endDTD() throws org.xml.sax.SAXException
-  {
+  public void endDTD() throws org.xml.sax.SAXException {
 
     // Do nothing for now.
   }
 
   /**
    * Begin the scope of a prefix-URI Namespace mapping.
-   *
-   * <p>The information from this event is not necessary for
-   * normal Namespace processing: the SAX XML reader will
-   * automatically replace prefixes for element and attribute
-   * names when the http://xml.org/sax/features/namespaces
-   * feature is true (the default).</p>
-   *
-   * <p>There are cases, however, when applications need to
-   * use prefixes in character data or in attribute values,
-   * where they cannot safely be expanded automatically; the
-   * start/endPrefixMapping event supplies the information
-   * to the application to expand prefixes in those contexts
-   * itself, if necessary.</p>
-   *
-   * <p>Note that start/endPrefixMapping events are not
-   * guaranteed to be properly nested relative to each-other:
-   * all startPrefixMapping events will occur before the
-   * corresponding startElement event, and all endPrefixMapping
-   * events will occur after the corresponding endElement event,
-   * but their order is not guaranteed.</p>
-   *
-   * @param prefix The Namespace prefix being declared.
-   * @param uri The Namespace URI the prefix is mapped to.
+   * 
+   * <p>
+   * The information from this event is not necessary for normal Namespace
+   * processing: the SAX XML reader will automatically replace prefixes for
+   * element and attribute names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).
+   * </p>
+   * 
+   * <p>
+   * There are cases, however, when applications need to use prefixes in
+   * character data or in attribute values, where they cannot safely be expanded
+   * automatically; the start/endPrefixMapping event supplies the information to
+   * the application to expand prefixes in those contexts itself, if necessary.
+   * </p>
+   * 
+   * <p>
+   * Note that start/endPrefixMapping events are not guaranteed to be properly
+   * nested relative to each-other: all startPrefixMapping events will occur
+   * before the corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event, but their order
+   * is not guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
    * @see #endPrefixMapping
    * @see #startElement
    */
   public void startPrefixMapping(String prefix, String uri)
-          throws org.xml.sax.SAXException
-  {
+      throws org.xml.sax.SAXException {
 
     /*
-    // Not sure if this is needed or wanted
-    // Also, it fails in the stree.
-    if((null != m_currentNode)
-       && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
-    {
-      String qname;
-      if(((null != prefix) && (prefix.length() == 0))
-         || (null == prefix))
-        qname = "xmlns";
-      else
-        qname = "xmlns:"+prefix;
-
-      Element elem = (Element)m_currentNode;
-      String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
-      if(val == null)
-      {
-        elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
-                            qname, uri);
-      }
-    }
-    */
+     * // Not sure if this is needed or wanted // Also, it fails in the stree.
+     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+     * = "xmlns:"+prefix;
+     * 
+     * Element elem = (Element)m_currentNode; String val =
+     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+     * uri); } }
+     */
   }
 
   /**
    * End the scope of a prefix-URI mapping.
-   *
-   * <p>See startPrefixMapping for details.  This event will
-   * always occur after the corresponding endElement event,
-   * but the order of endPrefixMapping events is not otherwise
-   * guaranteed.</p>
-   *
-   * @param prefix The prefix that was being mapping.
+   * 
+   * <p>
+   * See startPrefixMapping for details. This event will always occur after the
+   * corresponding endElement event, but the order of endPrefixMapping events is
+   * not otherwise guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The prefix that was being mapping.
    * @see #startPrefixMapping
    * @see #endElement
    */
-  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+  }
 
   /**
    * Receive notification of a skipped entity.
-   *
-   * <p>The Parser will invoke this method once for each entity
-   * skipped.  Non-validating processors may skip entities if they
-   * have not seen the declarations (because, for example, the
-   * entity was declared in an external DTD subset).  All processors
-   * may skip external entities, depending on the values of the
-   * http://xml.org/sax/features/external-general-entities and the
-   * http://xml.org/sax/features/external-parameter-entities
-   * properties.</p>
-   *
-   * @param name The name of the skipped entity.  If it is a
-   *        parameter entity, the name will begin with '%'.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each entity skipped.
+   * Non-validating processors may skip entities if they have not seen the
+   * declarations (because, for example, the entity was declared in an external
+   * DTD subset). All processors may skip external entities, depending on the
+   * values of the http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities properties.
+   * </p>
+   * 
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
    */
-  public void skippedEntity(String name) throws org.xml.sax.SAXException{}
-  
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+  }
+
   public boolean isUpperCaseElementNames() {
-      return upperCaseElementNames;
+    return upperCaseElementNames;
   }
 
   public void setUpperCaseElementNames(boolean upperCaseElementNames) {
-      this.upperCaseElementNames = upperCaseElementNames;
+    this.upperCaseElementNames = upperCaseElementNames;
   }
 }

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
 Thu Jan 29 05:38:59 2015
@@ -34,35 +34,35 @@ import org.w3c.dom.NodeList;
 /**
  * A collection of methods for extracting content from DOM trees.
  * 
- * This class holds a few utility methods for pulling content out of 
- * DOM nodes, such as getOutlinks, getText, etc.
- *
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ * 
  */
 public class DOMContentUtils {
 
   private static class LinkParams {
-       private String elName;
-       private String attrName;
-       private int childLen;
-      
-       private LinkParams(String elName, String attrName, int childLen) {
-          this.elName = elName;
-          this.attrName = attrName;
-          this.childLen = childLen;
-      }
-      
-       public String toString() {
-          return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
-      }
+    private String elName;
+    private String attrName;
+    private int childLen;
+
+    private LinkParams(String elName, String attrName, int childLen) {
+      this.elName = elName;
+      this.attrName = attrName;
+      this.childLen = childLen;
+    }
+
+    public String toString() {
+      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+    }
   }
-  
-  private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
+
+  private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
   private Configuration conf;
-  
+
   public DOMContentUtils(Configuration conf) {
     setConf(conf);
   }
-  
+
   public void setConf(Configuration conf) {
     // forceTags is used to override configurable tag ignoring, later on
     Collection<String> forceTags = new ArrayList<String>(1);
@@ -84,59 +84,57 @@ public class DOMContentUtils {
 
     // remove unwanted link tags from the linkParams map
     String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
-    for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) {
-      if ( ! forceTags.contains(ignoreTags[i]) )
+    for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+      if (!forceTags.contains(ignoreTags[i]))
         linkParams.remove(ignoreTags[i]);
     }
   }
-  
+
   /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node},
-   * and will append all the content text found beneath the DOM node to 
-   * the <code>StringBuffer</code>.
-   *
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append all the content text found beneath the DOM node to the
+   * <code>StringBuffer</code>.
+   * 
    * <p>
-   *
-   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
-   * be aborted and the <code>StringBuffer</code> will not contain
-   * any text encountered after a nested anchor is found.
+   * 
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+   * and the <code>StringBuffer</code> will not contain any text encountered
+   * after a nested anchor is found.
    * 
    * <p>
-   *
+   * 
    * @return true if nested anchors were found
    */
-  private boolean getText(StringBuffer sb, Node node, 
-                                      boolean abortOnNestedAnchors) {
+  private boolean getText(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors) {
     if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
       return true;
-    } 
+    }
     return false;
   }
 
-
   /**
-   * This is a convinience method, equivalent to {@link
-   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * This is a convinience method, equivalent to
+   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
    * 
    */
   public void getText(StringBuffer sb, Node node) {
     getText(sb, node, false);
   }
 
-  // returns true if abortOnNestedAnchors is true and we find nested 
+  // returns true if abortOnNestedAnchors is true and we find nested
   // anchors
-  private boolean getTextHelper(StringBuffer sb, Node node, 
-                                             boolean abortOnNestedAnchors,
-                                             int anchorDepth) {
+  private boolean getTextHelper(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors, int anchorDepth) {
     boolean abort = false;
     NodeWalker walker = new NodeWalker(node);
-    
+
     while (walker.hasNext()) {
-    
+
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
-      
+
       if ("script".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();
       }
@@ -148,7 +146,7 @@ public class DOMContentUtils {
         if (anchorDepth > 1) {
           abort = true;
           break;
-        }        
+        }
       }
       if (nodeType == Node.COMMENT_NODE) {
         walker.skipChildren();
@@ -159,44 +157,45 @@ public class DOMContentUtils {
         text = text.replaceAll("\\s+", " ");
         text = text.trim();
         if (text.length() > 0) {
-          if (sb.length() > 0) sb.append(' ');
-               sb.append(text);
+          if (sb.length() > 0)
+            sb.append(' ');
+          sb.append(text);
         }
       }
     }
-    
+
     return abort;
   }
 
   /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node},
-   * and will append the content text found beneath the first
-   * <code>title</code> node to the <code>StringBuffer</code>.
-   *
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append the content text found beneath the first <code>title</code> node to
+   * the <code>StringBuffer</code>.
+   * 
    * @return true if a title node was found, false otherwise
    */
   public boolean getTitle(StringBuffer sb, Node node) {
-    
+
     NodeWalker walker = new NodeWalker(node);
-    
+
     while (walker.hasNext()) {
-  
+
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
-      
+
       if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
         return false;
       }
-  
+
       if (nodeType == Node.ELEMENT_NODE) {
         if ("title".equalsIgnoreCase(nodeName)) {
           getText(sb, currentNode);
           return true;
         }
       }
-    }      
-    
+    }
+
     return false;
   }
 
@@ -204,28 +203,29 @@ public class DOMContentUtils {
   URL getBase(Node node) {
 
     NodeWalker walker = new NodeWalker(node);
-    
+
     while (walker.hasNext()) {
-  
+
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
-      
+
       // is this node a BASE tag?
       if (nodeType == Node.ELEMENT_NODE) {
-  
+
         if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
           return null;
         }
-  
+
         if ("base".equalsIgnoreCase(nodeName)) {
           NamedNodeMap attrs = currentNode.getAttributes();
-          for (int i= 0; i < attrs.getLength(); i++ ) {
+          for (int i = 0; i < attrs.getLength(); i++) {
             Node attr = attrs.item(i);
             if ("href".equalsIgnoreCase(attr.getNodeName())) {
               try {
                 return new URL(attr.getNodeValue());
-              } catch (MalformedURLException e) {}
+              } catch (MalformedURLException e) {
+              }
             }
           }
         }
@@ -236,10 +236,9 @@ public class DOMContentUtils {
     return null;
   }
 
-
   private boolean hasOnlyWhiteSpace(Node node) {
-    String val= node.getNodeValue();
-    for (int i= 0; i < val.length(); i++) {
+    String val = node.getNodeValue();
+    for (int i = 0; i < val.length(); i++) {
       if (!Character.isWhitespace(val.charAt(i)))
         return false;
     }
@@ -248,50 +247,49 @@ public class DOMContentUtils {
 
   // this only covers a few cases of empty links that are symptomatic
   // of nekohtml's DOM-fixup process...
-  private boolean shouldThrowAwayLink(Node node, NodeList children, 
-                                              int childLen, LinkParams params) {
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+      int childLen, LinkParams params) {
     if (childLen == 0) {
-      // this has no inner structure 
-      if (params.childLen == 0) return false;
-      else return true;
-    } else if ((childLen == 1) 
-               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
-               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { 
+      // this has no inner structure
+      if (params.childLen == 0)
+        return false;
+      else
+        return true;
+    } else if ((childLen == 1)
+        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
       // single nested link
       return true;
 
     } else if (childLen == 2) {
 
-      Node c0= children.item(0);
-      Node c1= children.item(1);
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
 
       if ((c0.getNodeType() == Node.ELEMENT_NODE)
           && (params.elName.equalsIgnoreCase(c0.getNodeName()))
-          && (c1.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c1) ) {
+          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
         // single link followed by whitespace node
         return true;
       }
 
       if ((c1.getNodeType() == Node.ELEMENT_NODE)
           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c0) ) {
+          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
         // whitespace node followed by single link
         return true;
       }
 
     } else if (childLen == 3) {
-      Node c0= children.item(0);
-      Node c1= children.item(1);
-      Node c2= children.item(2);
-      
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+      Node c2 = children.item(2);
+
       if ((c1.getNodeType() == Node.ELEMENT_NODE)
           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE) 
-          && (c2.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c0)
-          && hasOnlyWhiteSpace(c2) ) {
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2)) {
         // single link surrounded by whitespace nodes
         return true;
       }
@@ -299,76 +297,73 @@ public class DOMContentUtils {
 
     return false;
   }
-  
+
   /**
-   * This method finds all anchors below the supplied DOM
-   * <code>node</code>, and creates appropriate {@link Outlink}
-   * records for each (relative to the supplied <code>base</code>
-   * URL), and adds them to the <code>outlinks</code> {@link
-   * ArrayList}.
-   *
+   * This method finds all anchors below the supplied DOM <code>node</code>, and
+   * creates appropriate {@link Outlink} records for each (relative to the
+   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+   * {@link ArrayList}.
+   * 
    * <p>
-   *
-   * Links without inner structure (tags, text, etc) are discarded, as
-   * are links which contain only single nested links and empty text
-   * nodes (this is a common DOM-fixup artifact, at least with
-   * nekohtml).
+   * 
+   * Links without inner structure (tags, text, etc) are discarded, as are links
+   * which contain only single nested links and empty text nodes (this is a
+   * common DOM-fixup artifact, at least with nekohtml).
    */
-  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, 
-                                       Node node) {
-    
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
     NodeWalker walker = new NodeWalker(node);
     while (walker.hasNext()) {
-      
+
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
-      short nodeType = currentNode.getNodeType();      
+      short nodeType = currentNode.getNodeType();
       NodeList children = currentNode.getChildNodes();
-      int childLen = (children != null) ? children.getLength() : 0; 
-      
+      int childLen = (children != null) ? children.getLength() : 0;
+
       if (nodeType == Node.ELEMENT_NODE) {
-        
+
         nodeName = nodeName.toLowerCase();
-        LinkParams params = (LinkParams)linkParams.get(nodeName);
+        LinkParams params = (LinkParams) linkParams.get(nodeName);
         if (params != null) {
           if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
-  
+
             StringBuffer linkText = new StringBuffer();
             getText(linkText, currentNode, true);
-  
+
             NamedNodeMap attrs = currentNode.getAttributes();
             String target = null;
             boolean noFollow = false;
             boolean post = false;
-            for (int i= 0; i < attrs.getLength(); i++ ) {
+            for (int i = 0; i < attrs.getLength(); i++) {
               Node attr = attrs.item(i);
               String attrName = attr.getNodeName();
               if (params.attrName.equalsIgnoreCase(attrName)) {
                 target = attr.getNodeValue();
-              } else if ("rel".equalsIgnoreCase(attrName) &&
-                         "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              } else if ("rel".equalsIgnoreCase(attrName)
+                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
                 noFollow = true;
-              } else if ("method".equalsIgnoreCase(attrName) &&
-                         "post".equalsIgnoreCase(attr.getNodeValue())) {
+              } else if ("method".equalsIgnoreCase(attrName)
+                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
                 post = true;
               }
             }
             if (target != null && !noFollow && !post)
               try {
-                
+
                 URL url = URLUtil.resolveURL(base, target);
-                outlinks.add(new Outlink(url.toString(),
-                                         linkText.toString().trim()));
+                outlinks.add(new Outlink(url.toString(), linkText.toString()
+                    .trim()));
               } catch (MalformedURLException e) {
                 // don't care
               }
           }
           // this should not have any children, skip them
-          if (params.childLen == 0) continue;
+          if (params.childLen == 0)
+            continue;
         }
       }
     }
   }
 
 }
-

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
 Thu Jan 29 05:38:59 2015
@@ -23,33 +23,32 @@ import org.apache.nutch.parse.HTMLMetaTa
 import org.w3c.dom.*;
 
 /**
- * Class for parsing META Directives from DOM trees.  This class
- * handles specifically Robots META directives (all, none, nofollow,
- * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
- * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
  */
 public class HTMLMetaProcessor {
 
   /**
-   * Utility class with indicators for the robots directives "noindex"
-   * and "nofollow", and HTTP-EQUIV/no-cache
+   * Utility class with indicators for the robots directives "noindex" and
+   * "nofollow", and HTTP-EQUIV/no-cache
    */
-  
+
   /**
-   * Sets the indicators in <code>robotsMeta</code> to appropriate
-   * values, based on any META tags found under the given
-   * <code>node</code>.
+   * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
+   * on any META tags found under the given <code>node</code>.
    */
-  public static final void getMetaTags (
-    HTMLMetaTags metaTags, Node node, URL currURL) {
+  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
 
     metaTags.reset();
     getMetaTagsHelper(metaTags, node, currURL);
   }
 
-  private static final void getMetaTagsHelper(
-    HTMLMetaTags metaTags, Node node, URL currURL) {
-         
+  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
     if (node.getNodeType() == Node.ELEMENT_NODE) {
 
       if ("body".equalsIgnoreCase(node.getNodeName())) {
@@ -63,7 +62,7 @@ public class HTMLMetaProcessor {
         Node equivNode = null;
         Node contentNode = null;
         // Retrieves name, http-equiv and content attribues
-        for (int i=0; i<attrs.getLength(); i++) {
+        for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
           String attrName = attr.getNodeName().toLowerCase();
           if (attrName.equals("name")) {
@@ -74,44 +73,43 @@ public class HTMLMetaProcessor {
             contentNode = attr;
           }
         }
-        
+
         if (nameNode != null) {
           if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();   
+            String name = nameNode.getNodeValue().toLowerCase();
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
-  
+
               if (contentNode != null) {
-                String directives = 
-                  contentNode.getNodeValue().toLowerCase();
+                String directives = contentNode.getNodeValue().toLowerCase();
                 int index = directives.indexOf("none");
-  
+
                 if (index >= 0) {
                   metaTags.setNoIndex();
                   metaTags.setNoFollow();
                 }
-  
+
                 index = directives.indexOf("all");
                 if (index >= 0) {
                   // do nothing...
                 }
-  
+
                 index = directives.indexOf("noindex");
                 if (index >= 0) {
                   metaTags.setNoIndex();
                 }
-  
+
                 index = directives.indexOf("nofollow");
                 if (index >= 0) {
                   metaTags.setNoFollow();
                 }
-                
+
                 index = directives.indexOf("noarchive");
                 if (index >= 0) {
                   metaTags.setNoCache();
                 }
-              } 
-  
+              }
+
             } // end if (name == robots)
           }
         }
@@ -124,14 +122,15 @@ public class HTMLMetaProcessor {
             if ("pragma".equals(name)) {
               content = content.toLowerCase();
               int index = content.indexOf("no-cache");
-              if (index >= 0) 
+              if (index >= 0)
                 metaTags.setNoCache();
             } else if ("refresh".equals(name)) {
               int idx = content.indexOf(';');
               String time = null;
               if (idx == -1) { // just the refresh time
                 time = content;
-              } else time = content.substring(0, idx);
+              } else
+                time = content.substring(0, idx);
               try {
                 metaTags.setRefreshTime(Integer.parseInt(time));
                 // skip this if we couldn't parse the time
@@ -142,9 +141,11 @@ public class HTMLMetaProcessor {
               URL refreshUrl = null;
               if (metaTags.getRefresh() && idx != -1) { // set the URL
                 idx = content.toLowerCase().indexOf("url=");
-                if (idx == -1) { // assume a mis-formatted entry with just the url
+                if (idx == -1) { // assume a mis-formatted entry with just the
+                                 // url
                   idx = content.indexOf(';') + 1;
-                } else idx += 4;
+                } else
+                  idx += 4;
                 if (idx != -1) {
                   String url = content.substring(idx);
                   try {
@@ -187,13 +188,13 @@ public class HTMLMetaProcessor {
           try {
             if (currURL == null)
               url = new URL(urlString);
-            else 
+            else
               url = new URL(currURL, urlString);
           } catch (Exception e) {
             ;
           }
 
-          if (url != null) 
+          if (url != null)
             metaTags.setBaseHref(url);
         }
 

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 Thu Jan 29 05:38:59 2015
@@ -53,203 +53,200 @@ import org.w3c.dom.DocumentFragment;
 
 public class TikaParser implements org.apache.nutch.parse.Parser {
 
-       public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
+  public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
 
-       private Configuration conf;
-       private TikaConfig tikaConfig = null;
-       private DOMContentUtils utils;
-       private HtmlParseFilters htmlParseFilters;
-       private String cachingPolicy;
-       private HtmlMapper HTMLMapper;
-       private boolean upperCaseElementNames = true;
-
-       @SuppressWarnings("deprecation")
-       public ParseResult getParse(Content content) {
-               String mimeType = content.getContentType();
-
-               URL base;
-               try {
-                       base = new URL(content.getBaseUrl());
-               } catch (MalformedURLException e) {
-                       return new 
ParseStatus(e).getEmptyParseResult(content.getUrl(),
-                                       getConf());
-               }
-
-               // get the right parser using the mime type as a clue
-               Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
-               byte[] raw = content.getContent();
-
-               if (parser == null) {
-                       String message = "Can't retrieve Tika parser for 
mime-type "
-                                       + mimeType;
-                       LOG.error(message);
-                       return new ParseStatus(ParseStatus.FAILED, message)
-                                       .getEmptyParseResult(content.getUrl(), 
getConf());
-               }
-
-               LOG.debug("Using Tika parser " + parser.getClass().getName()
-                               + " for mime-type " + mimeType);
-
-               Metadata tikamd = new Metadata();
-
-               HTMLDocumentImpl doc = new HTMLDocumentImpl();
-               doc.setErrorChecking(false);
-               DocumentFragment root = doc.createDocumentFragment();
-               DOMBuilder domhandler = new DOMBuilder(doc, root);
-               domhandler.setUpperCaseElementNames(upperCaseElementNames);
-               ParseContext context = new ParseContext();
-               if (HTMLMapper != null)
-                       context.set(HtmlMapper.class, HTMLMapper);
-               tikamd.set(Metadata.CONTENT_TYPE, mimeType);
-               try {
-                       parser.parse(new ByteArrayInputStream(raw), domhandler, 
tikamd,
-                                       context);
-               } catch (Exception e) {
-                       LOG.error("Error parsing " + content.getUrl(), e);
-                       return new ParseStatus(ParseStatus.FAILED, 
e.getMessage())
-                                       .getEmptyParseResult(content.getUrl(), 
getConf());
-               }
-
-               HTMLMetaTags metaTags = new HTMLMetaTags();
-               String text = "";
-               String title = "";
-               Outlink[] outlinks = new Outlink[0];
-               org.apache.nutch.metadata.Metadata nutchMetadata = new 
org.apache.nutch.metadata.Metadata();
-
-               // we have converted the sax events generated by Tika into a 
DOM object
-               // so we can now use the usual HTML resources from Nutch
-               // get meta directives
-               HTMLMetaProcessor.getMetaTags(metaTags, root, base);
-               if (LOG.isTraceEnabled()) {
-                       LOG.trace("Meta tags for " + base + ": " + 
metaTags.toString());
-               }
-
-               // check meta directives
-               if (!metaTags.getNoIndex()) { // okay to index
-                       StringBuffer sb = new StringBuffer();
-                       if (LOG.isTraceEnabled()) {
-                               LOG.trace("Getting text...");
-                       }
-                       utils.getText(sb, root); // extract text
-                       text = sb.toString();
-                       sb.setLength(0);
-                       if (LOG.isTraceEnabled()) {
-                               LOG.trace("Getting title...");
-                       }
-                       utils.getTitle(sb, root); // extract title
-                       title = sb.toString().trim();
-               }
-
-               if (!metaTags.getNoFollow()) { // okay to follow links
-                       ArrayList<Outlink> l = new ArrayList<Outlink>(); // 
extract outlinks
-                       URL baseTag = utils.getBase(root);
-                       if (LOG.isTraceEnabled()) {
-                               LOG.trace("Getting links...");
-                       }
-                       utils.getOutlinks(baseTag != null ? baseTag : base, l, 
root);
-                       outlinks = l.toArray(new Outlink[l.size()]);
-                       if (LOG.isTraceEnabled()) {
-                               LOG.trace("found " + outlinks.length + " 
outlinks in "
-                                               + content.getUrl());
-                       }
-               }
-
-               // populate Nutch metadata with Tika metadata
-               String[] TikaMDNames = tikamd.names();
-               for (String tikaMDName : TikaMDNames) {
-                       if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
-                               continue;
-                       // TODO what if multivalued?
-                       nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
-               }
-
-               // no outlinks? try OutlinkExtractor e.g works for mime types 
where no
-               // explicit markup for anchors
-
-               if (outlinks.length == 0) {
-                       outlinks = OutlinkExtractor.getOutlinks(text, 
getConf());
-               }
-
-               ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
-               if (metaTags.getRefresh()) {
-                       status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
-                       status.setArgs(new String[] { 
metaTags.getRefreshHref().toString(),
-                                       
Integer.toString(metaTags.getRefreshTime()) });
-               }
-               ParseData parseData = new ParseData(status, title, outlinks,
-                               content.getMetadata(), nutchMetadata);
-               ParseResult parseResult = ParseResult.createParseResult(
-                               content.getUrl(), new ParseImpl(text, 
parseData));
-
-               // run filters on parse
-               ParseResult filteredParse = 
this.htmlParseFilters.filter(content,
-                               parseResult, metaTags, root);
-               if (metaTags.getNoCache()) { // not okay to cache
-                       for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry 
: filteredParse)
-                               entry.getValue().getData().getParseMeta()
-                                               
.set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
-               }
-               return filteredParse;
-       }
-
-       public void setConf(Configuration conf) {
-               this.conf = conf;
-               this.tikaConfig = null;
-
-               // do we want a custom Tika configuration file
-               // deprecated since Tika 0.7 which is based on
-               // a service provider based configuration
-               String customConfFile = conf.get("tika.config.file");
-               if (customConfFile != null) {
-                       try {
-                               // see if a Tika config file can be found in 
the job file
-                               URL customTikaConfig = 
conf.getResource(customConfFile);
-                               if (customTikaConfig != null)
-                                       tikaConfig = new 
TikaConfig(customTikaConfig);
-                       } catch (Exception e1) {
-                               String message = "Problem loading custom Tika 
configuration from "
-                                               + customConfFile;
-                               LOG.error(message, e1);
-                       }
-               } else {
-                       try {
-                               tikaConfig = new 
TikaConfig(this.getClass().getClassLoader());
-                       } catch (Exception e2) {
-                               String message = "Problem loading default Tika 
configuration";
-                               LOG.error(message, e2);
-                       }
-               }
-
-               // use a custom htmlmapper
-               String htmlmapperClassName = 
conf.get("tika.htmlmapper.classname");
-               if (StringUtils.isNotBlank(htmlmapperClassName)) {
-                       try {
-                               Class HTMLMapperClass = 
Class.forName(htmlmapperClassName);
-                               boolean interfaceOK = HtmlMapper.class
-                                               
.isAssignableFrom(HTMLMapperClass);
-                               if (!interfaceOK) {
-                                       throw new RuntimeException("Class " + 
htmlmapperClassName
-                                                       + " does not implement 
HtmlMapper");
-                               }
-                               HTMLMapper = (HtmlMapper) 
HTMLMapperClass.newInstance();
-                       } catch (Exception e) {
-                               LOG.error("Can't generate instance for class "
-                                               + htmlmapperClassName);
-                               throw new RuntimeException("Can't generate 
instance for class "
-                                               + htmlmapperClassName);
-                       }
-               }
-
-               this.htmlParseFilters = new HtmlParseFilters(getConf());
-               this.utils = new DOMContentUtils(conf);
-               this.cachingPolicy = 
getConf().get("parser.caching.forbidden.policy",
-                               Nutch.CACHING_FORBIDDEN_CONTENT);
-               this.upperCaseElementNames = getConf().getBoolean(
-                               "tika.uppercase.element.names", true);
-       }
-
-       public Configuration getConf() {
-               return this.conf;
-       }
+  private Configuration conf;
+  private TikaConfig tikaConfig = null;
+  private DOMContentUtils utils;
+  private HtmlParseFilters htmlParseFilters;
+  private String cachingPolicy;
+  private HtmlMapper HTMLMapper;
+  private boolean upperCaseElementNames = true;
+
+  @SuppressWarnings("deprecation")
+  public ParseResult getParse(Content content) {
+    String mimeType = content.getContentType();
+
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // get the right parser using the mime type as a clue
+    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
+    byte[] raw = content.getContent();
+
+    if (parser == null) {
+      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
+      LOG.error(message);
+      return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+
+    LOG.debug("Using Tika parser " + parser.getClass().getName()
+        + " for mime-type " + mimeType);
+
+    Metadata tikamd = new Metadata();
+
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment root = doc.createDocumentFragment();
+    DOMBuilder domhandler = new DOMBuilder(doc, root);
+    domhandler.setUpperCaseElementNames(upperCaseElementNames);
+    ParseContext context = new ParseContext();
+    if (HTMLMapper != null)
+      context.set(HtmlMapper.class, HTMLMapper);
+    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
+    try {
+      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
+    } catch (Exception e) {
+      LOG.error("Error parsing " + content.getUrl(), e);
+      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+    String text = "";
+    String title = "";
+    Outlink[] outlinks = new Outlink[0];
+    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
+
+    // we have converted the sax events generated by Tika into a DOM object
+    // so we can now use the usual HTML resources from Nutch
+    // get meta directives
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+    }
+
+    // check meta directives
+    if (!metaTags.getNoIndex()) { // okay to index
+      StringBuffer sb = new StringBuffer();
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting text...");
+      }
+      utils.getText(sb, root); // extract text
+      text = sb.toString();
+      sb.setLength(0);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting title...");
+      }
+      utils.getTitle(sb, root); // extract title
+      title = sb.toString().trim();
+    }
+
+    if (!metaTags.getNoFollow()) { // okay to follow links
+      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+      URL baseTag = utils.getBase(root);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting links...");
+      }
+      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      outlinks = l.toArray(new Outlink[l.size()]);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("found " + outlinks.length + " outlinks in "
+            + content.getUrl());
+      }
+    }
+
+    // populate Nutch metadata with Tika metadata
+    String[] TikaMDNames = tikamd.names();
+    for (String tikaMDName : TikaMDNames) {
+      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
+        continue;
+      // TODO what if multivalued?
+      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
+    }
+
+    // no outlinks? try OutlinkExtractor e.g works for mime types where no
+    // explicit markup for anchors
+
+    if (outlinks.length == 0) {
+      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+    }
+
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
+          Integer.toString(metaTags.getRefreshTime()) });
+    }
+    ParseData parseData = new ParseData(status, title, outlinks,
+        content.getMetadata(), nutchMetadata);
+    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+        new ParseImpl(text, parseData));
+
+    // run filters on parse
+    ParseResult filteredParse = this.htmlParseFilters.filter(content,
+        parseResult, metaTags, root);
+    if (metaTags.getNoCache()) { // not okay to cache
+      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+        entry.getValue().getData().getParseMeta()
+            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+    }
+    return filteredParse;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.tikaConfig = null;
+
+    // do we want a custom Tika configuration file
+    // deprecated since Tika 0.7 which is based on
+    // a service provider based configuration
+    String customConfFile = conf.get("tika.config.file");
+    if (customConfFile != null) {
+      try {
+        // see if a Tika config file can be found in the job file
+        URL customTikaConfig = conf.getResource(customConfFile);
+        if (customTikaConfig != null)
+          tikaConfig = new TikaConfig(customTikaConfig);
+      } catch (Exception e1) {
+        String message = "Problem loading custom Tika configuration from "
+            + customConfFile;
+        LOG.error(message, e1);
+      }
+    } else {
+      try {
+        tikaConfig = new TikaConfig(this.getClass().getClassLoader());
+      } catch (Exception e2) {
+        String message = "Problem loading default Tika configuration";
+        LOG.error(message, e2);
+      }
+    }
+
+    // use a custom htmlmapper
+    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+    if (StringUtils.isNotBlank(htmlmapperClassName)) {
+      try {
+        Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+        boolean interfaceOK = HtmlMapper.class
+            .isAssignableFrom(HTMLMapperClass);
+        if (!interfaceOK) {
+          throw new RuntimeException("Class " + htmlmapperClassName
+              + " does not implement HtmlMapper");
+        }
+        HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+      } catch (Exception e) {
+        LOG.error("Can't generate instance for class " + htmlmapperClassName);
+        throw new RuntimeException("Can't generate instance for class "
+            + htmlmapperClassName);
+      }
+    }
+
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
+    this.upperCaseElementNames = getConf().getBoolean(
+        "tika.uppercase.element.names", true);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
 
 }

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
 Thu Jan 29 05:38:59 2015
@@ -26,40 +26,42 @@
 package org.apache.nutch.parse.tika;
 
 /**
- * Class used to verify whether the specified <var>ch</var> 
- * conforms to the XML 1.0 definition of whitespace. 
+ * Class used to verify whether the specified <var>ch</var> conforms to the XML
+ * 1.0 definition of whitespace.
  */
-class XMLCharacterRecognizer
-{
+class XMLCharacterRecognizer {
 
   /**
-   * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition
-   * of whitespace.  Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S">
-   * the definition of <CODE>S</CODE></A> for details.
-   * @param ch Character to check as XML whitespace.
+   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
+   * definition of whitespace. Refer to <A
+   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
+   * <CODE>S</CODE></A> for details.
+   * 
+   * @param ch
+   *          Character to check as XML whitespace.
    * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
    */
-  static boolean isWhiteSpace(char ch)
-  {
+  static boolean isWhiteSpace(char ch) {
     return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
   }
 
   /**
    * Tell if the string is whitespace.
-   *
-   * @param ch Character array to check as XML whitespace.
-   * @param start Start index of characters in the array
-   * @param length Number of characters in the array 
-   * @return True if the characters in the array are 
-   * XML whitespace; otherwise, false.
+   * 
+   * @param ch
+   *          Character array to check as XML whitespace.
+   * @param start
+   *          Start index of characters in the array
+   * @param length
+   *          Number of characters in the array
+   * @return True if the characters in the array are XML whitespace; otherwise,
+   *         false.
    */
-  static boolean isWhiteSpace(char ch[], int start, int length)
-  {
+  static boolean isWhiteSpace(char ch[], int start, int length) {
 
     int end = start + length;
 
-    for (int s = start; s < end; s++)
-    {
+    for (int s = start; s < end; s++) {
       if (!isWhiteSpace(ch[s]))
         return false;
     }
@@ -69,39 +71,36 @@ class XMLCharacterRecognizer
 
   /**
    * Tell if the string is whitespace.
-   *
-   * @param buf StringBuffer to check as XML whitespace.
+   * 
+   * @param buf
+   *          StringBuffer to check as XML whitespace.
    * @return True if characters in buffer are XML whitespace, false otherwise
    */
-  static boolean isWhiteSpace(StringBuffer buf)
-  {
+  static boolean isWhiteSpace(StringBuffer buf) {
 
     int n = buf.length();
 
-    for (int i = 0; i < n; i++)
-    {
+    for (int i = 0; i < n; i++) {
       if (!isWhiteSpace(buf.charAt(i)))
         return false;
     }
 
     return true;
   }
-  
+
   /**
    * Tell if the string is whitespace.
-   *
-   * @param s String to check as XML whitespace.
+   * 
+   * @param s
+   *          String to check as XML whitespace.
    * @return True if characters in buffer are XML whitespace, false otherwise
    */
-  static boolean isWhiteSpace(String s)
-  {
+  static boolean isWhiteSpace(String s) {
 
-    if(null != s)
-    {
+    if (null != s) {
       int n = s.length();
-  
-      for (int i = 0; i < n; i++)
-      {
+
+      for (int i = 0; i < n; i++) {
         if (!isWhiteSpace(s.charAt(i)))
           return false;
       }

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * <a href="http://tika.apache.org/">Apache Tika</a>.
  */
 package org.apache.nutch.parse.tika;
+


Reply via email to