Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Fri Jan 9 06:34:33 2015 @@ -39,136 +39,125 @@ import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.ext.LexicalHandler; + /** - * This class takes SAX events (in addition to some extra events - * that SAX doesn't handle yet) and adds the result to a document - * or document fragment. + * This class takes SAX events (in addition to some extra events that SAX + * doesn't handle yet) and adds the result to a document or document fragment. */ -class DOMBuilder - implements ContentHandler, LexicalHandler -{ +class DOMBuilder implements ContentHandler, LexicalHandler { - /** Root document */ + /** Root document */ public Document m_doc; - /** Current node */ + /** Current node */ protected Node m_currentNode = null; - /** First node of document fragment or null if not a DocumentFragment */ + /** First node of document fragment or null if not a DocumentFragment */ public DocumentFragment m_docFrag = null; - /** Vector of element nodes */ + /** Vector of element nodes */ protected Stack<Element> m_elemStack = new Stack<Element>(); /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document fragment. - * - * @param doc Root document - * @param node Current node + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. 
+ * + * @param doc + * Root document + * @param node + * Current node */ - DOMBuilder(Document doc, Node node) - { + DOMBuilder(Document doc, Node node) { m_doc = doc; m_currentNode = node; } /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document fragment. - * - * @param doc Root document - * @param docFrag Document fragment + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param docFrag + * Document fragment */ - DOMBuilder(Document doc, DocumentFragment docFrag) - { + DOMBuilder(Document doc, DocumentFragment docFrag) { m_doc = doc; m_docFrag = docFrag; } /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document. - * - * @param doc Root document + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document. + * + * @param doc + * Root document */ - DOMBuilder(Document doc) - { + DOMBuilder(Document doc) { m_doc = doc; } /** - * Get the root node of the DOM being created. This - * is either a Document or a DocumentFragment. - * + * Get the root node of the DOM being created. This is either a Document or a + * DocumentFragment. + * * @return The root document or document fragment if not null */ - Node getRootNode() - { + Node getRootNode() { return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; } /** * Get the node currently being processed. - * + * * @return the current node being processed */ - Node getCurrentNode() - { + Node getCurrentNode() { return m_currentNode; } /** * Return null since there is no Writer for this class. - * + * * @return null */ - java.io.Writer getWriter() - { + java.io.Writer getWriter() { return null; } /** * Append a node to the current container. 
- * - * @param newNode New node to append + * + * @param newNode + * New node to append */ - protected void append(Node newNode) throws org.xml.sax.SAXException - { + protected void append(Node newNode) throws org.xml.sax.SAXException { Node currentNode = m_currentNode; - if (null != currentNode) - { + if (null != currentNode) { currentNode.appendChild(newNode); // System.out.println(newNode.getNodeName()); - } - else if (null != m_docFrag) - { + } else if (null != m_docFrag) { m_docFrag.appendChild(newNode); - } - else - { + } else { boolean ok = true; short type = newNode.getNodeType(); - if (type == Node.TEXT_NODE) - { + if (type == Node.TEXT_NODE) { String data = newNode.getNodeValue(); - if ((null != data) && (data.trim().length() > 0)) - { - throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring..."); + if ((null != data) && (data.trim().length() > 0)) { + throw new org.xml.sax.SAXException( + "Warning: can't output text before document element! Ignoring..."); } ok = false; - } - else if (type == Node.ELEMENT_NODE) - { - if (m_doc.getDocumentElement() != null) - { - throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } else if (type == Node.ELEMENT_NODE) { + if (m_doc.getDocumentElement() != null) { + throw new org.xml.sax.SAXException( + "Can't have more than one root on a DOM!"); } } @@ -179,132 +168,139 @@ class DOMBuilder /** * Receive an object for locating the origin of SAX document events. - * - * <p>SAX parsers are strongly encouraged (though not absolutely - * required) to supply a locator: if it does so, it must supply - * the locator to the application by invoking this method before - * invoking any of the other methods in the ContentHandler - * interface.</p> - * - * <p>The locator allows the application to determine the end - * position of any document-related event, even if the parser is - * not reporting an error. 
Typically, the application will - * use this information for reporting its own errors (such as - * character content that does not match an application's - * business rules). The information returned by the locator - * is probably not sufficient for use with a search engine.</p> - * - * <p>Note that the locator will return correct information only - * during the invocation of the events in this interface. The - * application should not attempt to use it at any other time.</p> - * - * @param locator An object that can return the location of - * any SAX document event. + * + * <p> + * SAX parsers are strongly encouraged (though not absolutely required) to + * supply a locator: if it does so, it must supply the locator to the + * application by invoking this method before invoking any of the other + * methods in the ContentHandler interface. + * </p> + * + * <p> + * The locator allows the application to determine the end position of any + * document-related event, even if the parser is not reporting an error. + * Typically, the application will use this information for reporting its own + * errors (such as character content that does not match an application's + * business rules). The information returned by the locator is probably not + * sufficient for use with a search engine. + * </p> + * + * <p> + * Note that the locator will return correct information only during the + * invocation of the events in this interface. The application should not + * attempt to use it at any other time. + * </p> + * + * @param locator + * An object that can return the location of any SAX document event. * @see org.xml.sax.Locator */ - public void setDocumentLocator(Locator locator) - { + public void setDocumentLocator(Locator locator) { // No action for the moment. } /** * Receive notification of the beginning of a document. 
- * - * <p>The SAX parser will invoke this method only once, before any - * other methods in this interface or in DTDHandler (except for - * setDocumentLocator).</p> + * + * <p> + * The SAX parser will invoke this method only once, before any other methods + * in this interface or in DTDHandler (except for setDocumentLocator). + * </p> */ - public void startDocument() throws org.xml.sax.SAXException - { + public void startDocument() throws org.xml.sax.SAXException { // No action for the moment. } /** * Receive notification of the end of a document. - * - * <p>The SAX parser will invoke this method only once, and it will - * be the last method invoked during the parse. The parser shall - * not invoke this method until it has either abandoned parsing - * (because of an unrecoverable error) or reached the end of - * input.</p> + * + * <p> + * The SAX parser will invoke this method only once, and it will be the last + * method invoked during the parse. The parser shall not invoke this method + * until it has either abandoned parsing (because of an unrecoverable error) + * or reached the end of input. + * </p> */ - public void endDocument() throws org.xml.sax.SAXException - { + public void endDocument() throws org.xml.sax.SAXException { // No action for the moment. } /** * Receive notification of the beginning of an element. - * - * <p>The Parser will invoke this method at the beginning of every - * element in the XML document; there will be a corresponding - * endElement() event for every startElement() event (even when the - * element is empty). All of the element's content will be - * reported, in order, before the corresponding endElement() - * event.</p> - * - * <p>If the element name has a namespace prefix, the prefix will - * still be attached. 
Note that the attribute list provided will - * contain only attributes with explicit values (specified or - * defaulted): #IMPLIED attributes will be omitted.</p> - * - * - * @param ns The namespace of the node - * @param localName The local part of the qualified name - * @param name The element name. - * @param atts The attributes attached to the element, if any. + * + * <p> + * The Parser will invoke this method at the beginning of every element in the + * XML document; there will be a corresponding endElement() event for every + * startElement() event (even when the element is empty). All of the element's + * content will be reported, in order, before the corresponding endElement() + * event. + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached. Note that the attribute list provided will contain only + * attributes with explicit values (specified or defaulted): #IMPLIED + * attributes will be omitted. + * </p> + * + * + * @param ns + * The namespace of the node + * @param localName + * The local part of the qualified name + * @param name + * The element name. + * @param atts + * The attributes attached to the element, if any. * @see #endElement * @see org.xml.sax.Attributes */ - public void startElement( - String ns, String localName, String name, Attributes atts) - throws org.xml.sax.SAXException - { + public void startElement(String ns, String localName, String name, + Attributes atts) throws org.xml.sax.SAXException { Element elem; - // Note that the namespace-aware call must be used to correctly - // construct a Level 2 DOM, even for non-namespaced nodes. + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. 
if ((null == ns) || (ns.length() == 0)) - elem = m_doc.createElementNS(null,name); + elem = m_doc.createElementNS(null, name); else elem = m_doc.createElementNS(ns, name); append(elem); - try - { + try { int nAtts = atts.getLength(); - if (0 != nAtts) - { - for (int i = 0; i < nAtts; i++) - { + if (0 != nAtts) { + for (int i = 0; i < nAtts; i++) { - //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // System.out.println("type " + atts.getType(i) + " name " + + // atts.getLocalName(i) ); // First handle a possible ID attribute if (atts.getType(i).equalsIgnoreCase("ID")) setIDAttribute(atts.getValue(i), elem); String attrNS = atts.getURI(i); - if("".equals(attrNS)) + if ("".equals(attrNS)) attrNS = null; // DOM represents no-namespace as null // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) - // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); // Crimson won't let us set an xmlns: attribute on the DOM. String attrQName = atts.getQName(i); - // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + // In SAX, xmlns: attributes have an empty namespace, while in DOM + // they should have the xmlns namespace if (attrQName.startsWith("xmlns:")) attrNS = "http://www.w3.org/2000/xmlns/"; // ALWAYS use the DOM Level 2 call! - elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + elem.setAttributeNS(attrNS, attrQName, atts.getValue(i)); } } @@ -315,9 +311,7 @@ class DOMBuilder m_currentNode = elem; // append(elem); - } - catch(java.lang.Exception de) - { + } catch (java.lang.Exception de) { // de.printStackTrace(); throw new org.xml.sax.SAXException(de); } @@ -325,74 +319,87 @@ class DOMBuilder } /** - - - + * + * + * * Receive notification of the end of an element. 
- * - * <p>The SAX parser will invoke this method at the end of every - * element in the XML document; there will be a corresponding - * startElement() event for every endElement() event (even when the - * element is empty).</p> - * - * <p>If the element name has a namespace prefix, the prefix will - * still be attached to the name.</p> - * - * - * @param ns the namespace of the element - * @param localName The local part of the qualified name of the element - * @param name The element name + * + * <p> + * The SAX parser will invoke this method at the end of every element in the + * XML document; there will be a corresponding startElement() event for every + * endElement() event (even when the element is empty). + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached to the name. + * </p> + * + * + * @param ns + * the namespace of the element + * @param localName + * The local part of the qualified name of the element + * @param name + * The element name */ public void endElement(String ns, String localName, String name) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { m_elemStack.pop(); - m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek(); + m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek(); } /** * Set an ID string to node association in the ID table. - * - * @param id The ID string. - * @param elem The associated ID. + * + * @param id + * The ID string. + * @param elem + * The associated ID. */ - public void setIDAttribute(String id, Element elem) - { + public void setIDAttribute(String id, Element elem) { // Do nothing. This method is meant to be overiden. } /** * Receive notification of character data. - * - * <p>The Parser will call this method to report each chunk of - * character data. 
SAX parsers may return all contiguous character - * data in a single chunk, or they may split it into several - * chunks; however, all of the characters in any single event - * must come from the same external entity, so that the Locator - * provides useful information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * <p>Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating - * parsers must do so).</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. + * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. 
* @see #ignorableWhitespace * @see org.xml.sax.Locator */ - public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error + public void characters(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error - if (m_inCData) - { + if (m_inCData) { cdata(ch, start, length); return; @@ -400,57 +407,55 @@ class DOMBuilder String s = new String(ch, start, length); Node childNode; - childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; - if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ - ((Text)childNode).appendData(s); - } - else{ - Text text = m_doc.createTextNode(s); - append(text); + childNode = m_currentNode != null ? m_currentNode.getLastChild() : null; + if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) { + ((Text) childNode).appendData(s); + } else { + Text text = m_doc.createTextNode(s); + append(text); } } /** - * If available, when the disable-output-escaping attribute is used, - * output raw text without escaping. A PI will be inserted in front - * of the node with the name "lotusxsl-next-is-raw" and a value of - * "formatter-to-dom". - * - * @param ch Array containing the characters - * @param start Index to start of characters in the array - * @param length Number of characters in the array + * If available, when the disable-output-escaping attribute is used, output + * raw text without escaping. A PI will be inserted in front of the node with + * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom". 
+ * + * @param ch + * Array containing the characters + * @param start + * Index to start of characters in the array + * @param length + * Number of characters in the array */ public void charactersRaw(char ch[], int start, int length) - throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error - + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); append(m_doc.createProcessingInstruction("xslt-next-is-raw", - "formatter-to-dom")); + "formatter-to-dom")); append(m_doc.createTextNode(s)); } /** * Report the beginning of an entity. - * - * The start and end of the document entity are not reported. - * The start and end of the external DTD subset are reported - * using the pseudo-name "[dtd]". All other events must be - * properly nested within start/end entity events. - * - * @param name The name of the entity. If it is a parameter - * entity, the name will begin with '%'. + * + * The start and end of the document entity are not reported. The start and + * end of the external DTD subset are reported using the pseudo-name "[dtd]". + * All other events must be properly nested within start/end entity events. + * + * @param name + * The name of the entity. If it is a parameter entity, the name will + * begin with '%'. * @see #endEntity * @see org.xml.sax.ext.DeclHandler#internalEntityDecl * @see org.xml.sax.ext.DeclHandler#externalEntityDecl */ - public void startEntity(String name) throws org.xml.sax.SAXException - { + public void startEntity(String name) throws org.xml.sax.SAXException { // Almost certainly the wrong behavior... // entityReference(name); @@ -458,49 +463,58 @@ class DOMBuilder /** * Report the end of an entity. - * - * @param name The name of the entity that is ending. 
+ * + * @param name + * The name of the entity that is ending. * @see #startEntity */ - public void endEntity(String name) throws org.xml.sax.SAXException{} + public void endEntity(String name) throws org.xml.sax.SAXException { + } /** * Receive notivication of a entityReference. - * - * @param name name of the entity reference + * + * @param name + * name of the entity reference */ - public void entityReference(String name) throws org.xml.sax.SAXException - { + public void entityReference(String name) throws org.xml.sax.SAXException { append(m_doc.createEntityReference(name)); } /** * Receive notification of ignorable whitespace in element content. - * - * <p>Validating Parsers must use this method to report each chunk - * of ignorable whitespace (see the W3C XML 1.0 recommendation, - * section 2.10): non-validating parsers may also use this method - * if they are capable of parsing and using content models.</p> - * - * <p>SAX parsers may return all contiguous whitespace in a single - * chunk, or they may split it into several chunks; however, all of - * the characters in any single event must come from the same - * external entity, so that the Locator provides useful - * information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. + * + * <p> + * Validating Parsers must use this method to report each chunk of ignorable + * whitespace (see the W3C XML 1.0 recommendation, section 2.10): + * non-validating parsers may also use this method if they are capable of + * parsing and using content models. 
+ * </p> + * + * <p> + * SAX parsers may return all contiguous whitespace in a single chunk, or they + * may split it into several chunks; however, all of the characters in any + * single event must come from the same external entity, so that the Locator + * provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. * @see #characters */ public void ignorableWhitespace(char ch[], int start, int length) - throws org.xml.sax.SAXException - { - if(isOutsideDocElem()) - return; // avoid DOM006 Hierarchy request error + throws org.xml.sax.SAXException { + if (isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); @@ -509,232 +523,244 @@ class DOMBuilder /** * Tell if the current node is outside the document element. - * + * * @return true if the current node is outside the document element. */ - private boolean isOutsideDocElem() - { - return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); - } + private boolean isOutsideDocElem() { + return (null == m_docFrag) + && m_elemStack.size() == 0 + && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } /** * Receive notification of a processing instruction. - * - * <p>The Parser will invoke this method once for each processing - * instruction found: note that processing instructions may occur - * before or after the main document element.</p> - * - * <p>A SAX parser should never report an XML declaration (XML 1.0, - * section 2.8) or a text declaration (XML 1.0, section 4.3.1) - * using this method.</p> - * - * @param target The processing instruction target. 
- * @param data The processing instruction data, or null if - * none was supplied. + * + * <p> + * The Parser will invoke this method once for each processing instruction + * found: note that processing instructions may occur before or after the main + * document element. + * </p> + * + * <p> + * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) + * or a text declaration (XML 1.0, section 4.3.1) using this method. + * </p> + * + * @param target + * The processing instruction target. + * @param data + * The processing instruction data, or null if none was supplied. */ public void processingInstruction(String target, String data) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { append(m_doc.createProcessingInstruction(target, data)); } /** * Report an XML comment anywhere in the document. - * - * This callback will be used for comments inside or outside the - * document element, including comments in the external DTD - * subset (if read). - * - * @param ch An array holding the characters in the comment. - * @param start The starting position in the array. - * @param length The number of characters to use from the array. + * + * This callback will be used for comments inside or outside the document + * element, including comments in the external DTD subset (if read). + * + * @param ch + * An array holding the characters in the comment. + * @param start + * The starting position in the array. + * @param length + * The number of characters to use from the array. 
*/ - public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException - { + public void comment(char ch[], int start, int length) + throws org.xml.sax.SAXException { // tagsoup sometimes submits invalid values here - if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) + return; append(m_doc.createComment(new String(ch, start, length))); } - /** Flag indicating that we are processing a CData section */ + /** Flag indicating that we are processing a CData section */ protected boolean m_inCData = false; /** * Report the start of a CDATA section. - * + * * @see #endCDATA */ - public void startCDATA() throws org.xml.sax.SAXException - { + public void startCDATA() throws org.xml.sax.SAXException { m_inCData = true; append(m_doc.createCDATASection("")); } /** * Report the end of a CDATA section. - * + * * @see #startCDATA */ - public void endCDATA() throws org.xml.sax.SAXException - { + public void endCDATA() throws org.xml.sax.SAXException { m_inCData = false; } /** * Receive notification of cdata. - * - * <p>The Parser will call this method to report each chunk of - * character data. SAX parsers may return all contiguous character - * data in a single chunk, or they may split it into several - * chunks; however, all of the characters in any single event - * must come from the same external entity, so that the Locator - * provides useful information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * <p>Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating - * parsers must do so).</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. 
+ * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. * @see #ignorableWhitespace * @see org.xml.sax.Locator */ - public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error + public void cdata(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); - // XXX [email protected]: modified from the original, to accomodate TagSoup. + // XXX [email protected]: modified from the original, to accomodate TagSoup. Node n = m_currentNode.getLastChild(); if (n instanceof CDATASection) - ((CDATASection)n).appendData(s); + ((CDATASection) n).appendData(s); else if (n instanceof Comment) - ((Comment)n).appendData(s); + ((Comment) n).appendData(s); } /** * Report the start of DTD declarations, if any. - * - * Any declarations are assumed to be in the internal subset - * unless otherwise indicated. - * - * @param name The document type name. 
- * @param publicId The declared public identifier for the - * external DTD subset, or null if none was declared. - * @param systemId The declared system identifier for the - * external DTD subset, or null if none was declared. + * + * Any declarations are assumed to be in the internal subset unless otherwise + * indicated. + * + * @param name + * The document type name. + * @param publicId + * The declared public identifier for the external DTD subset, or + * null if none was declared. + * @param systemId + * The declared system identifier for the external DTD subset, or + * null if none was declared. * @see #endDTD * @see #startEntity */ public void startDTD(String name, String publicId, String systemId) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { // Do nothing for now. } /** * Report the end of DTD declarations. - * + * * @see #startDTD */ - public void endDTD() throws org.xml.sax.SAXException - { + public void endDTD() throws org.xml.sax.SAXException { // Do nothing for now. } /** * Begin the scope of a prefix-URI Namespace mapping. 
- * - * <p>The information from this event is not necessary for - * normal Namespace processing: the SAX XML reader will - * automatically replace prefixes for element and attribute - * names when the http://xml.org/sax/features/namespaces - * feature is true (the default).</p> - * - * <p>There are cases, however, when applications need to - * use prefixes in character data or in attribute values, - * where they cannot safely be expanded automatically; the - * start/endPrefixMapping event supplies the information - * to the application to expand prefixes in those contexts - * itself, if necessary.</p> - * - * <p>Note that start/endPrefixMapping events are not - * guaranteed to be properly nested relative to each-other: - * all startPrefixMapping events will occur before the - * corresponding startElement event, and all endPrefixMapping - * events will occur after the corresponding endElement event, - * but their order is not guaranteed.</p> - * - * @param prefix The Namespace prefix being declared. - * @param uri The Namespace URI the prefix is mapped to. + * + * <p> + * The information from this event is not necessary for normal Namespace + * processing: the SAX XML reader will automatically replace prefixes for + * element and attribute names when the http://xml.org/sax/features/namespaces + * feature is true (the default). + * </p> + * + * <p> + * There are cases, however, when applications need to use prefixes in + * character data or in attribute values, where they cannot safely be expanded + * automatically; the start/endPrefixMapping event supplies the information to + * the application to expand prefixes in those contexts itself, if necessary. 
+ * </p> + * + * <p> + * Note that start/endPrefixMapping events are not guaranteed to be properly + * nested relative to each-other: all startPrefixMapping events will occur + * before the corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, but their order + * is not guaranteed. + * </p> + * + * @param prefix + * The Namespace prefix being declared. + * @param uri + * The Namespace URI the prefix is mapped to. * @see #endPrefixMapping * @see #startElement */ public void startPrefixMapping(String prefix, String uri) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { /* - // Not sure if this is needed or wanted - // Also, it fails in the stree. - if((null != m_currentNode) - && (m_currentNode.getNodeType() == Node.ELEMENT_NODE)) - { - String qname; - if(((null != prefix) && (prefix.length() == 0)) - || (null == prefix)) - qname = "xmlns"; - else - qname = "xmlns:"+prefix; - - Element elem = (Element)m_currentNode; - String val = elem.getAttribute(qname); // Obsolete, should be DOM2...? - if(val == null) - { - elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", - qname, uri); - } - } - */ + * // Not sure if this is needed or wanted // Also, it fails in the stree. + * if((null != m_currentNode) && (m_currentNode.getNodeType() == + * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) && + * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname + * = "xmlns:"+prefix; + * + * Element elem = (Element)m_currentNode; String val = + * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null) + * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname, + * uri); } } + */ } /** * End the scope of a prefix-URI mapping. - * - * <p>See startPrefixMapping for details. 
This event will - * always occur after the corresponding endElement event, - * but the order of endPrefixMapping events is not otherwise - * guaranteed.</p> - * - * @param prefix The prefix that was being mapped. + * + * <p> + * See startPrefixMapping for details. This event will always occur after the + * corresponding endElement event, but the order of endPrefixMapping events is + * not otherwise guaranteed. + * </p> + * + * @param prefix + * The prefix that was being mapped. * @see #startPrefixMapping * @see #endElement */ - public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{} + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { + } /** * Receive notification of a skipped entity. - * - * <p>The Parser will invoke this method once for each entity - * skipped. Non-validating processors may skip entities if they - * have not seen the declarations (because, for example, the - * entity was declared in an external DTD subset). All processors - * may skip external entities, depending on the values of the - * http://xml.org/sax/features/external-general-entities and the - * http://xml.org/sax/features/external-parameter-entities - * properties.</p> - * - * @param name The name of the skipped entity. If it is a - * parameter entity, the name will begin with '%'. + * + * <p> + * The Parser will invoke this method once for each entity skipped. + * Non-validating processors may skip entities if they have not seen the + * declarations (because, for example, the entity was declared in an external + * DTD subset). All processors may skip external entities, depending on the + * values of the http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities properties. + * </p> + * + * @param name + * The name of the skipped entity. If it is a parameter entity, the + * name will begin with '%'.
*/ - public void skippedEntity(String name) throws org.xml.sax.SAXException{} + public void skippedEntity(String name) throws org.xml.sax.SAXException { + } }
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Fri Jan 9 06:34:33 2015 @@ -34,34 +34,34 @@ import org.w3c.dom.NodeList; /** * A collection of methods for extracting content from DOM trees. * - * This class holds a few utility methods for pulling content out of - * DOM nodes, such as getOutlinks, getText, etc. - * + * This class holds a few utility methods for pulling content out of DOM nodes, + * such as getOutlinks, getText, etc. + * */ public class DOMContentUtils { private static class LinkParams { - private String elName; - private String attrName; - private int childLen; - - private LinkParams(String elName, String attrName, int childLen) { - this.elName = elName; - this.attrName = attrName; - this.childLen = childLen; - } - - public String toString() { - return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; - } + private String elName; + private String attrName; + private int childLen; + + private LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } } - + private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); - + public DOMContentUtils(Configuration conf) { setConf(conf); } - + public void setConf(Configuration conf) { // forceTags is used to override configurable tag ignoring, later on 
Collection<String> forceTags = new ArrayList<String>(1); @@ -82,59 +82,57 @@ public class DOMContentUtils { // remove unwanted link tags from the linkParams map String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); - for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { - if ( ! forceTags.contains(ignoreTags[i]) ) + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + if (!forceTags.contains(ignoreTags[i])) linkParams.remove(ignoreTags[i]); } } - + /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, - * and will append all the content text found beneath the DOM node to - * the <code>StringBuffer</code>. - * + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * <code>StringBuffer</code>. + * * <p> - * - * If <code>abortOnNestedAnchors</code> is true, DOM traversal will - * be aborted and the <code>StringBuffer</code> will not contain - * any text encountered after a nested anchor is found. + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted + * and the <code>StringBuffer</code> will not contain any text encountered + * after a nested anchor is found. * * <p> - * + * * @return true if nested anchors were found */ - private boolean getText(StringBuffer sb, Node node, - boolean abortOnNestedAnchors) { + private boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { return true; - } + } return false; } - /** - * This is a convinience method, equivalent to {@link - * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. 
* */ public void getText(StringBuffer sb, Node node) { getText(sb, node, false); } - // returns true if abortOnNestedAnchors is true and we find nested + // returns true if abortOnNestedAnchors is true and we find nested // anchors - private boolean getTextHelper(StringBuffer sb, Node node, - boolean abortOnNestedAnchors, - int anchorDepth) { + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { boolean abort = false; NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } @@ -146,7 +144,7 @@ public class DOMContentUtils { if (anchorDepth > 1) { abort = true; break; - } + } } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); @@ -157,44 +155,45 @@ public class DOMContentUtils { text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { - if (sb.length() > 0) sb.append(' '); - sb.append(text); + if (sb.length() > 0) + sb.append(' '); + sb.append(text); } } } - + return abort; } /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, - * and will append the content text found beneath the first - * <code>title</code> node to the <code>StringBuffer</code>. - * + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append the content text found beneath the first <code>title</code> node to + * the <code>StringBuffer</code>. 
+ * * @return true if a title node was found, false otherwise */ public boolean getTitle(StringBuffer sb, Node node) { - + NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return false; } - + if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { getText(sb, currentNode); return true; } } - } - + } + return false; } @@ -202,28 +201,29 @@ public class DOMContentUtils { URL getBase(Node node) { NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + // is this node a BASE tag? if (nodeType == Node.ELEMENT_NODE) { - + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return null; } - + if ("base".equalsIgnoreCase(nodeName)) { NamedNodeMap attrs = currentNode.getAttributes(); - for (int i= 0; i < attrs.getLength(); i++ ) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); if ("href".equalsIgnoreCase(attr.getNodeName())) { try { return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) {} + } catch (MalformedURLException e) { + } } } } @@ -234,10 +234,9 @@ public class DOMContentUtils { return null; } - private boolean hasOnlyWhiteSpace(Node node) { - String val= node.getNodeValue(); - for (int i= 0; i < val.length(); i++) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { if (!Character.isWhitespace(val.charAt(i))) return false; } @@ -246,50 +245,49 @@ public class DOMContentUtils { // this only covers a few cases of empty links that are symptomatic // of nekohtml's DOM-fixup process... 
- private boolean shouldThrowAwayLink(Node node, NodeList children, - int childLen, LinkParams params) { + private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { if (childLen == 0) { - // this has no inner structure - if (params.childLen == 0) return false; - else return true; - } else if ((childLen == 1) - && (children.item(0).getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { // single nested link return true; } else if (childLen == 2) { - Node c0= children.item(0); - Node c1= children.item(1); + Node c0 = children.item(0); + Node c1 = children.item(1); if ((c0.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c0.getNodeName())) - && (c1.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c1) ) { + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { // single link followed by whitespace node return true; } if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) ) { + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { // whitespace node followed by single link return true; } } else if (childLen == 3) { - Node c0= children.item(0); - Node c1= children.item(1); - Node c2= children.item(2); - + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && (c2.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) - && hasOnlyWhiteSpace(c2) ) { + && (c0.getNodeType() 
== Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { // single link surrounded by whitespace nodes return true; } @@ -297,57 +295,54 @@ public class DOMContentUtils { return false; } - + /** - * This method finds all anchors below the supplied DOM - * <code>node</code>, and creates appropriate {@link Outlink} - * records for each (relative to the supplied <code>base</code> - * URL), and adds them to the <code>outlinks</code> {@link - * ArrayList}. - * + * This method finds all anchors below the supplied DOM <code>node</code>, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> + * {@link ArrayList}. + * * <p> - * - * Links without inner structure (tags, text, etc) are discarded, as - * are links which contain only single nested links and empty text - * nodes (this is a common DOM-fixup artifact, at least with - * nekohtml). + * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). */ - public void getOutlinks(URL base, ArrayList<Outlink> outlinks, - Node node) { - + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { + NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); + short nodeType = currentNode.getNodeType(); NodeList children = currentNode.getChildNodes(); - int childLen = (children != null) ? children.getLength() : 0; - + int childLen = (children != null) ? 
children.getLength() : 0; + if (nodeType == Node.ELEMENT_NODE) { - + nodeName = nodeName.toLowerCase(); - LinkParams params = (LinkParams)linkParams.get(nodeName); + LinkParams params = (LinkParams) linkParams.get(nodeName); if (params != null) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { - + StringBuffer linkText = new StringBuffer(); getText(linkText, currentNode, true); - + NamedNodeMap attrs = currentNode.getAttributes(); String target = null; boolean noFollow = false; boolean post = false; - for (int i= 0; i < attrs.getLength(); i++ ) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); String attrName = attr.getNodeName(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getNodeValue(); - } else if ("rel".equalsIgnoreCase(attrName) && - "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + } else if ("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { noFollow = true; - } else if ("method".equalsIgnoreCase(attrName) && - "post".equalsIgnoreCase(attr.getNodeValue())) { + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { post = true; } } @@ -355,18 +350,18 @@ public class DOMContentUtils { try { URL url = URLUtil.resolveURL(base, target); - outlinks.add(new Outlink(url.toString(), - linkText.toString().trim())); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); } catch (MalformedURLException e) { // don't care } } // this should not have any children, skip them - if (params.childLen == 0) continue; + if (params.childLen == 0) + continue; } } } } } - Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff 
============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Fri Jan 9 06:34:33 2015 @@ -23,32 +23,31 @@ import org.apache.nutch.parse.HTMLMetaTa import org.w3c.dom.*; /** - * Class for parsing META Directives from DOM trees. This class - * handles specifically Robots META directives (all, none, nofollow, - * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache - * instructions. All meta directives are stored in a HTMLMetaTags instance. + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. */ public class HTMLMetaProcessor { /** - * Utility class with indicators for the robots directives "noindex" - * and "nofollow", and HTTP-EQUIV/no-cache + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache */ - + /** - * Sets the indicators in <code>robotsMeta</code> to appropriate - * values, based on any META tags found under the given - * <code>node</code>. + * Sets the indicators in <code>robotsMeta</code> to appropriate values, based + * on any META tags found under the given <code>node</code>. 
*/ - public static final void getMetaTags ( - HTMLMetaTags metaTags, Node node, URL currURL) { + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { metaTags.reset(); getMetaTagsHelper(metaTags, node, currURL); } - private static final void getMetaTagsHelper( - HTMLMetaTags metaTags, Node node, URL currURL) { + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { if (node.getNodeType() == Node.ELEMENT_NODE) { @@ -63,7 +62,7 @@ public class HTMLMetaProcessor { Node equivNode = null; Node contentNode = null; // Retrieves name, http-equiv and content attribues - for (int i=0; i<attrs.getLength(); i++) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); String attrName = attr.getNodeName().toLowerCase(); if (attrName.equals("name")) { @@ -74,44 +73,43 @@ public class HTMLMetaProcessor { contentNode = attr; } } - + if (nameNode != null) { if (contentNode != null) { String name = nameNode.getNodeValue().toLowerCase(); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { - + if (contentNode != null) { - String directives = - contentNode.getNodeValue().toLowerCase(); + String directives = contentNode.getNodeValue().toLowerCase(); int index = directives.indexOf("none"); - + if (index >= 0) { metaTags.setNoIndex(); metaTags.setNoFollow(); } - + index = directives.indexOf("all"); if (index >= 0) { // do nothing... 
} - + index = directives.indexOf("noindex"); if (index >= 0) { metaTags.setNoIndex(); } - + index = directives.indexOf("nofollow"); if (index >= 0) { metaTags.setNoFollow(); } - + index = directives.indexOf("noarchive"); if (index >= 0) { metaTags.setNoCache(); } - } - + } + } // end if (name == robots) } } @@ -124,14 +122,15 @@ public class HTMLMetaProcessor { if ("pragma".equals(name)) { content = content.toLowerCase(); int index = content.indexOf("no-cache"); - if (index >= 0) + if (index >= 0) metaTags.setNoCache(); } else if ("refresh".equals(name)) { int idx = content.indexOf(';'); String time = null; if (idx == -1) { // just the refresh time time = content; - } else time = content.substring(0, idx); + } else + time = content.substring(0, idx); try { metaTags.setRefreshTime(Integer.parseInt(time)); // skip this if we couldn't parse the time @@ -142,9 +141,11 @@ public class HTMLMetaProcessor { URL refreshUrl = null; if (metaTags.getRefresh() && idx != -1) { // set the URL idx = content.toLowerCase().indexOf("url="); - if (idx == -1) { // assume a mis-formatted entry with just the url + if (idx == -1) { // assume a mis-formatted entry with just the + // url idx = content.indexOf(';') + 1; - } else idx += 4; + } else + idx += 4; if (idx != -1) { String url = content.substring(idx); try { @@ -187,13 +188,13 @@ public class HTMLMetaProcessor { try { if (currURL == null) url = new URL(urlString); - else + else url = new URL(currURL, urlString); } catch (Exception e) { ; } - if (url != null) + if (url != null) metaTags.setBaseHref(url); } Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Fri Jan 9 06:34:33 2015 @@ -47,204 +47,195 @@ import org.xml.sax.SAXException; */ public class TikaConfig { - private final Map<String, Parser> parsers = new HashMap<String, Parser>(); + private final Map<String, Parser> parsers = new HashMap<String, Parser>(); - private final MimeTypes mimeTypes; + private final MimeTypes mimeTypes; - public TikaConfig(String file) throws TikaException, IOException, - SAXException { - this(new File(file)); - } - - public TikaConfig(File file) throws TikaException, IOException, - SAXException { - this(getBuilder().parse(file)); - } - - public TikaConfig(URL url) throws TikaException, IOException, SAXException { - this(getBuilder().parse(url.toString())); - } - - public TikaConfig(InputStream stream) throws TikaException, IOException, - SAXException { - this(getBuilder().parse(stream)); - } - - /** - * @deprecated This method will be removed in Apache Tika 1.0 - * @see <a - * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> - */ - public TikaConfig(InputStream stream, Parser delegate) - throws TikaException, IOException, SAXException { - this(stream); - } - - public TikaConfig(Document document) throws TikaException, IOException { - this(document.getDocumentElement()); - } - - /** - * @deprecated This method will be removed in Apache Tika 1.0 - * @see <a - * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> - */ - public TikaConfig(Document document, Parser delegate) throws TikaException, - IOException { - this(document); - } - - public TikaConfig(Element element) throws TikaException, IOException { - Element mtr = getChild(element, "mimeTypeRepository"); - if (mtr != null && mtr.hasAttribute("resource")) { - mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); - } else { - mimeTypes = 
MimeTypesFactory.create("tika-mimetypes.xml"); - } - - NodeList nodes = element.getElementsByTagName("parser"); - for (int i = 0; i < nodes.getLength(); i++) { - Element node = (Element) nodes.item(i); - String name = node.getAttribute("class"); - - try { - Class<?> parserClass = Class.forName(name); - Object instance = parserClass.newInstance(); - if (!(instance instanceof Parser)) { - throw new TikaException( - "Configured class is not a Tika Parser: " + name); - } - Parser parser = (Parser) instance; - - NodeList mimes = node.getElementsByTagName("mime"); - if (mimes.getLength() > 0) { - for (int j = 0; j < mimes.getLength(); j++) { - parsers.put(getText(mimes.item(j)).trim(), parser); - } - } else { - ParseContext context = new ParseContext(); - for (MediaType type : parser.getSupportedTypes(context)) { - parsers.put(type.toString(), parser); - } - } - } catch (ClassNotFoundException e) { - throw new TikaException("Configured parser class not found: " - + name, e); - } catch (IllegalAccessException e) { - throw new TikaException("Unable to access a parser class: " - + name, e); - } catch (InstantiationException e) { - throw new TikaException( - "Unable to instantiate a parser class: " + name, e); - } - } - } - - public TikaConfig() throws MimeTypeException, IOException { - ParseContext context = new ParseContext(); - Iterator<Parser> iterator = ServiceRegistry.lookupProviders( - Parser.class, this.getClass().getClassLoader()); - while (iterator.hasNext()) { - Parser parser = iterator.next(); - for (MediaType type : parser.getSupportedTypes(context)) { - parsers.put(type.toString(), parser); - } - } - mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); - } - - /** - * @deprecated This method will be removed in Apache Tika 1.0 - * @see <a - * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> - */ - public TikaConfig(Element element, Parser delegate) throws TikaException, - IOException { - this(element); - } - - private String getText(Node 
node) { - if (node.getNodeType() == Node.TEXT_NODE) { - return node.getNodeValue(); - } else if (node.getNodeType() == Node.ELEMENT_NODE) { - StringBuilder builder = new StringBuilder(); - NodeList list = node.getChildNodes(); - for (int i = 0; i < list.getLength(); i++) { - builder.append(getText(list.item(i))); - } - return builder.toString(); - } else { - return ""; - } - } - - /** - * Returns the parser instance configured for the given MIME type. Returns - * <code>null</code> if the given MIME type is unknown. - * - * @param mimeType - * MIME type - * @return configured Parser instance, or <code>null</code> - */ - public Parser getParser(String mimeType) { - return parsers.get(mimeType); - } - - public Map<String, Parser> getParsers() { - return parsers; - } - - public MimeTypes getMimeRepository() { - return mimeTypes; - } - - /** - * Provides a default configuration (TikaConfig). Currently creates a new - * instance each time it's called; we may be able to have it return a shared - * instance once it is completely immutable. 
- * - * @return default configuration - */ - public static TikaConfig getDefaultConfig() { - try { - return new TikaConfig(); - } catch (IOException e) { - throw new RuntimeException("Unable to read default configuration", - e); - } catch (TikaException e) { - throw new RuntimeException( - "Unable to access default configuration", e); - } - } - - /** - * @deprecated This method will be removed in Apache Tika 1.0 - * @see <a - * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> - */ - public static TikaConfig getDefaultConfig(Parser delegate) - throws TikaException { - return getDefaultConfig(); - } - - private static DocumentBuilder getBuilder() throws TikaException { - try { - return DocumentBuilderFactory.newInstance().newDocumentBuilder(); - } catch (ParserConfigurationException e) { - throw new TikaException("XML parser not available", e); - } - } - - private static Element getChild(Element element, String name) { - Node child = element.getFirstChild(); - while (child != null) { - if (child.getNodeType() == Node.ELEMENT_NODE - && name.equals(child.getNodeName())) { - return (Element) child; - } - child = child.getNextSibling(); - } - return null; + public TikaConfig(String file) throws TikaException, IOException, + SAXException { + this(new File(file)); + } + + public TikaConfig(File file) throws TikaException, IOException, SAXException { + this(getBuilder().parse(file)); + } + + public TikaConfig(URL url) throws TikaException, IOException, SAXException { + this(getBuilder().parse(url.toString())); + } + + public TikaConfig(InputStream stream) throws TikaException, IOException, + SAXException { + this(getBuilder().parse(stream)); + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(InputStream stream, Parser delegate) throws TikaException, + IOException, SAXException { + this(stream); + } + + public TikaConfig(Document 
document) throws TikaException, IOException { + this(document.getDocumentElement()); + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(Document document, Parser delegate) throws TikaException, + IOException { + this(document); + } + + public TikaConfig(Element element) throws TikaException, IOException { + Element mtr = getChild(element, "mimeTypeRepository"); + if (mtr != null && mtr.hasAttribute("resource")) { + mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); + } else { + mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); + } + + NodeList nodes = element.getElementsByTagName("parser"); + for (int i = 0; i < nodes.getLength(); i++) { + Element node = (Element) nodes.item(i); + String name = node.getAttribute("class"); + + try { + Class<?> parserClass = Class.forName(name); + Object instance = parserClass.newInstance(); + if (!(instance instanceof Parser)) { + throw new TikaException("Configured class is not a Tika Parser: " + + name); + } + Parser parser = (Parser) instance; + + NodeList mimes = node.getElementsByTagName("mime"); + if (mimes.getLength() > 0) { + for (int j = 0; j < mimes.getLength(); j++) { + parsers.put(getText(mimes.item(j)).trim(), parser); + } + } else { + ParseContext context = new ParseContext(); + for (MediaType type : parser.getSupportedTypes(context)) { + parsers.put(type.toString(), parser); + } + } + } catch (ClassNotFoundException e) { + throw new TikaException("Configured parser class not found: " + name, e); + } catch (IllegalAccessException e) { + throw new TikaException("Unable to access a parser class: " + name, e); + } catch (InstantiationException e) { + throw new TikaException( + "Unable to instantiate a parser class: " + name, e); + } + } + } + + public TikaConfig() throws MimeTypeException, IOException { + ParseContext context = new ParseContext(); + Iterator<Parser> 
iterator = ServiceRegistry.lookupProviders(Parser.class, + this.getClass().getClassLoader()); + while (iterator.hasNext()) { + Parser parser = iterator.next(); + for (MediaType type : parser.getSupportedTypes(context)) { + parsers.put(type.toString(), parser); + } + } + mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(Element element, Parser delegate) throws TikaException, + IOException { + this(element); + } + + private String getText(Node node) { + if (node.getNodeType() == Node.TEXT_NODE) { + return node.getNodeValue(); + } else if (node.getNodeType() == Node.ELEMENT_NODE) { + StringBuilder builder = new StringBuilder(); + NodeList list = node.getChildNodes(); + for (int i = 0; i < list.getLength(); i++) { + builder.append(getText(list.item(i))); + } + return builder.toString(); + } else { + return ""; + } + } + + /** + * Returns the parser instance configured for the given MIME type. Returns + * <code>null</code> if the given MIME type is unknown. + * + * @param mimeType + * MIME type + * @return configured Parser instance, or <code>null</code> + */ + public Parser getParser(String mimeType) { + return parsers.get(mimeType); + } + + public Map<String, Parser> getParsers() { + return parsers; + } + + public MimeTypes getMimeRepository() { + return mimeTypes; + } + + /** + * Provides a default configuration (TikaConfig). Currently creates a new + * instance each time it's called; we may be able to have it return a shared + * instance once it is completely immutable. 
+ * + * @return default configuration + */ + public static TikaConfig getDefaultConfig() { + try { + return new TikaConfig(); + } catch (IOException e) { + throw new RuntimeException("Unable to read default configuration", e); + } catch (TikaException e) { + throw new RuntimeException("Unable to access default configuration", e); + } + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public static TikaConfig getDefaultConfig(Parser delegate) + throws TikaException { + return getDefaultConfig(); + } + + private static DocumentBuilder getBuilder() throws TikaException { + try { + return DocumentBuilderFactory.newInstance().newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new TikaException("XML parser not available", e); + } + } + + private static Element getChild(Element element, String name) { + Node child = element.getFirstChild(); + while (child != null) { + if (child.getNodeType() == Node.ELEMENT_NODE + && name.equals(child.getNodeName())) { + return (Element) child; + } + child = child.getNextSibling(); } + return null; + } } \ No newline at end of file Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java Fri Jan 9 06:34:33 2015 @@ -26,40 +26,42 @@ package org.apache.nutch.parse.tika; /** - * Class used to verify whether the specified <var>ch</var> - * conforms to the XML 1.0 
definition of whitespace. + * Class used to verify whether the specified <var>ch</var> conforms to the XML + * 1.0 definition of whitespace. */ -class XMLCharacterRecognizer -{ +class XMLCharacterRecognizer { /** - * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition - * of whitespace. Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> - * the definition of <CODE>S</CODE></A> for details. - * @param ch Character to check as XML whitespace. + * Returns whether the specified <var>ch</var> conforms to the XML 1.0 + * definition of whitespace. Refer to <A + * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of + * <CODE>S</CODE></A> for details. + * + * @param ch + * Character to check as XML whitespace. * @return =true if <var>ch</var> is XML whitespace; otherwise =false. */ - static boolean isWhiteSpace(char ch) - { + static boolean isWhiteSpace(char ch) { return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA); } /** * Tell if the string is whitespace. - * - * @param ch Character array to check as XML whitespace. - * @param start Start index of characters in the array - * @param length Number of characters in the array - * @return True if the characters in the array are - * XML whitespace; otherwise, false. + * + * @param ch + * Character array to check as XML whitespace. + * @param start + * Start index of characters in the array + * @param length + * Number of characters in the array + * @return True if the characters in the array are XML whitespace; otherwise, + * false. */ - static boolean isWhiteSpace(char ch[], int start, int length) - { + static boolean isWhiteSpace(char ch[], int start, int length) { int end = start + length; - for (int s = start; s < end; s++) - { + for (int s = start; s < end; s++) { if (!isWhiteSpace(ch[s])) return false; } @@ -69,39 +71,36 @@ class XMLCharacterRecognizer /** * Tell if the string is whitespace. 
- * - * @param buf StringBuffer to check as XML whitespace. + * + * @param buf + * StringBuffer to check as XML whitespace. * @return True if characters in buffer are XML whitespace, false otherwise */ - static boolean isWhiteSpace(StringBuffer buf) - { + static boolean isWhiteSpace(StringBuffer buf) { int n = buf.length(); - for (int i = 0; i < n; i++) - { + for (int i = 0; i < n; i++) { if (!isWhiteSpace(buf.charAt(i))) return false; } return true; } - + /** * Tell if the string is whitespace. - * - * @param s String to check as XML whitespace. + * + * @param s + * String to check as XML whitespace. * @return True if characters in buffer are XML whitespace, false otherwise */ - static boolean isWhiteSpace(String s) - { + static boolean isWhiteSpace(String s) { - if(null != s) - { + if (null != s) { int n = s.length(); - - for (int i = 0; i < n; i++) - { + + for (int i = 0; i < n; i++) { if (!isWhiteSpace(s.charAt(i))) return false; } Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * <a href="http://tika.apache.org/">Apache Tika</a>. */ package org.apache.nutch.parse.tika; +
