http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/nutch-plugins/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java new file mode 100644 index 0000000..a399273 --- /dev/null +++ b/nutch-plugins/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.ext; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +/** + * Unit tests for ExtParser. First creates a temp file with fixed content, then + * fetch and parse it using external command 'cat' and 'md5sum' alternately for + * 10 times. Doing so also does a light stress test for class CommandRunner.java + * (as used in ExtParser.java). + * + * Warning: currently only do test on linux platform. 
*
* @author John Xing
*/
public class TestExtParser {
  /** Temporary file holding {@link #expectedText}; created in {@link #setUp()}. */
  private File tempFile = null;
  /** URL form of {@link #tempFile}, handed to the protocol and parser. */
  private String urlString = null;
  /** Content fetched from {@link #urlString} in {@link #setUp()}. */
  private Content content = null;
  /** Result of the most recent external parse in {@link #testIt()}. */
  private Parse parse = null;

  private String expectedText = "nutch rocks nutch rocks nutch rocks";
  // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
  private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";

  /**
   * Writes {@link #expectedText} to a fresh temporary file and fetches that
   * file through the protocol registered for its URL, so that
   * {@link #content} is ready for parsing.
   *
   * <p>
   * The method is {@code public} because JUnit 4 rejects non-public
   * {@code @Before}/{@code @After} methods when it validates the test class.
   * </p>
   *
   * @throws ProtocolException
   *           declared for the protocol lookup and fetch
   * @throws IOException
   *           if the temporary file cannot be created or written
   */
  @Before
  public void setUp() throws ProtocolException, IOException {
    // prepare a temp file with expectedText as its content
    // This system property is defined in ./src/plugin/build-plugin.xml
    String path = System.getProperty("test.data");
    if (path != null) {
      File tempDir = new File(path);
      // mkdirs() also creates missing parents; createTempFile would fail
      // below if the directory still did not exist.
      if (!tempDir.exists())
        tempDir.mkdirs();
      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
          tempDir);
    } else {
      // otherwise in java.io.tmpdir
      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
    }
    urlString = tempFile.toURI().toURL().toString();

    // Close the stream even when write() fails, and pin the charset so the
    // bytes on disk do not depend on the platform default.
    FileOutputStream fos = new FileOutputStream(tempFile);
    try {
      fos.write(expectedText.getBytes("UTF-8"));
    } finally {
      fos.close();
    }

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
        .getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();
    protocol = null;
  }

  /**
   * Drops the reference to the fetched content. The temporary file is left on
   * disk: its deletion is commented out below, exactly as in the original.
   */
  @After
  public void tearDown() {
    // clean content
    content = null;

    // clean temp file
    // if (tempFile != null && tempFile.exists())
    // tempFile.delete();
  }

  /**
   * Parses the fetched content with the external 'cat' and 'md5sum' parsers
   * alternately, ten rounds each, and checks the text each one produces. On
   * any OS whose {@code os.name} is not "linux" the method prints a notice to
   * stderr and returns without asserting anything.
   *
   * @throws ParseException
   *           declared for the external parse calls
   */
  @Test
  public void testIt() throws ParseException {
    String contentType;

    // now test only on linux platform
    String osName = System.getProperty("os.name");
    if (!osName.equalsIgnoreCase("linux")) {
      System.err.println("Current OS is " + osName + ".");
      System.err.println("No test is run on OS other than linux.");
      return;
    }

    Configuration conf = NutchConfiguration.create();
    // loop alternately, total 10*2 times of invoking external command
    for (int i = 0; i < 10; i++) {
      // check external parser that does 'cat'
      contentType = "application/vnd.nutch.example.cat";
      content.setContentType(contentType);
      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
          content.getUrl());
      Assert.assertEquals(expectedText, parse.getText());

      // check external parser that does 'md5sum'
      contentType = "application/vnd.nutch.example.md5sum";
      content.setContentType(contentType);
      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
          content.getUrl());
      // md5sum output starts with the digest; only that prefix is compared.
      Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
    }
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/build.xml b/nutch-plugins/parse-html/build.xml new file mode 100755 index 0000000..a5b99b5 --- /dev/null +++ b/nutch-plugins/parse-html/build.xml @@ -0,0 +1,40 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-html" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-nekohtml"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-nekohtml/*.jar" /> + </fileset> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/ivy.xml b/nutch-plugins/parse-html/ivy.xml new file mode 100644 index 0000000..e8a6135 --- /dev/null +++ b/nutch-plugins/parse-html/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/plugin.xml b/nutch-plugins/parse-html/plugin.xml new file mode 100755 index 0000000..3be70c3 --- /dev/null +++ b/nutch-plugins/parse-html/plugin.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="parse-html" + name="Html Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-html.jar"> + <export name="*"/> + </library> + <library name="tagsoup-1.2.1.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-nekohtml"/> + </requires> + + <extension id="org.apache.nutch.parse.html" + name="HtmlParse" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.html.HtmlParser" + class="org.apache.nutch.parse.html.HtmlParser"> + <parameter name="contentType" value="text/html|application/xhtml+xml"/> + <parameter name="pathSuffix" value=""/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/pom.xml b/nutch-plugins/parse-html/pom.xml new file mode 100644 index 0000000..589155b --- /dev/null +++ b/nutch-plugins/parse-html/pom.xml @@ -0,0 +1,49 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-html</artifactId> + <packaging>jar</packaging> + + <name>parse-html</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.ccil.cowan.tagsoup</groupId> <artifactId>tagsoup</artifactId> <version>1.2.1</version> + </dependency> + <dependency> + <groupId> net.sourceforge.nekohtml</groupId> + <artifactId>nekohtml</artifactId> + <version>1.9.22</version> + </dependency> + + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java new file mode 100644 index 0000000..6a1038b --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java @@ -0,0 +1,766 @@ +/* + * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * $Id$
 */
package org.apache.nutch.parse.html;

import java.util.Stack;

import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.w3c.dom.CDATASection;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.ext.LexicalHandler;

/**
 * This class takes SAX events (in addition to some extra events that SAX
 * doesn't handle yet) and adds the result to a document or document fragment.
 */
public class DOMBuilder implements ContentHandler, LexicalHandler {

  /** Namespace URI that DOM requires for "xmlns" and "xmlns:*" attributes. */
  private static final String XMLNS_URI = "http://www.w3.org/2000/xmlns/";

  /** Root document */
  public Document m_doc;

  /** Current node */
  protected Node m_currentNode = null;

  /** First node of document fragment or null if not a DocumentFragment */
  public DocumentFragment m_docFrag = null;

  /** Stack of the element nodes that are currently open */
  protected Stack<Element> m_elemStack = new Stack<Element>();

  /** Flag indicating that we are processing a CData section */
  protected boolean m_inCData = false;

  /**
   * DOMBuilder instance constructor... it will add the DOM nodes to the given
   * node.
   *
   * @param doc
   *          Root document
   * @param node
   *          Current node
   */
  public DOMBuilder(Document doc, Node node) {
    m_doc = doc;
    m_currentNode = node;
  }

  /**
   * DOMBuilder instance constructor... it will add the DOM nodes to the
   * document fragment.
   *
   * @param doc
   *          Root document
   * @param docFrag
   *          Document fragment
   */
  public DOMBuilder(Document doc, DocumentFragment docFrag) {
    m_doc = doc;
    m_docFrag = docFrag;
  }

  /**
   * DOMBuilder instance constructor... it will add the DOM nodes to the
   * document.
   *
   * @param doc
   *          Root document
   */
  public DOMBuilder(Document doc) {
    m_doc = doc;
  }

  /**
   * Get the root node of the DOM being created. This is either a Document or a
   * DocumentFragment.
   *
   * @return The document fragment if one was supplied, otherwise the root
   *         document
   */
  public Node getRootNode() {
    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
  }

  /**
   * Get the node currently being processed.
   *
   * @return the current node being processed
   */
  public Node getCurrentNode() {
    return m_currentNode;
  }

  /**
   * Return null since there is no Writer for this class.
   *
   * @return null
   */
  public java.io.Writer getWriter() {
    return null;
  }

  /**
   * Returns the node that {@link #append(Node)} adds children to: the current
   * node if there is one, otherwise the document fragment, otherwise the
   * document.
   *
   * @return the container described above; null only if all three are null
   */
  private Node appendTarget() {
    if (null != m_currentNode)
      return m_currentNode;

    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
  }

  /**
   * Append a node to the current container.
   *
   * <p>
   * With neither a current node nor a document fragment the node goes
   * directly under the document. In that case whitespace-only text is
   * silently skipped, other text is rejected, and a second root element is
   * rejected.
   * </p>
   *
   * @param newNode
   *          New node to append
   * @throws org.xml.sax.SAXException
   *           if non-whitespace text or a second root element would be placed
   *           directly under the document
   */
  protected void append(Node newNode) throws org.xml.sax.SAXException {

    Node currentNode = m_currentNode;

    if (null != currentNode) {
      currentNode.appendChild(newNode);
    } else if (null != m_docFrag) {
      m_docFrag.appendChild(newNode);
    } else {
      boolean ok = true;
      short type = newNode.getNodeType();

      if (type == Node.TEXT_NODE) {
        String data = newNode.getNodeValue();

        if ((null != data) && (data.trim().length() > 0)) {
          throw new org.xml.sax.SAXException(
              "Warning: can't output text before document element!  Ignoring...");
        }

        // Whitespace-only text directly under the document is dropped.
        ok = false;
      } else if (type == Node.ELEMENT_NODE) {
        if (m_doc.getDocumentElement() != null) {
          throw new org.xml.sax.SAXException(
              "Can't have more than one root on a DOM!");
        }
      }

      if (ok)
        m_doc.appendChild(newNode);
    }
  }

  /**
   * Receive an object for locating the origin of SAX document events.
   *
   * <p>
   * SAX parsers are strongly encouraged (though not absolutely required) to
   * supply a locator: if it does so, it must supply the locator to the
   * application by invoking this method before invoking any of the other
   * methods in the ContentHandler interface.
   * </p>
   *
   * <p>
   * The locator allows the application to determine the end position of any
   * document-related event, even if the parser is not reporting an error.
   * Typically, the application will use this information for reporting its own
   * errors (such as character content that does not match an application's
   * business rules). The information returned by the locator is probably not
   * sufficient for use with a search engine.
   * </p>
   *
   * <p>
   * Note that the locator will return correct information only during the
   * invocation of the events in this interface. The application should not
   * attempt to use it at any other time.
   * </p>
   *
   * @param locator
   *          An object that can return the location of any SAX document event.
   * @see org.xml.sax.Locator
   */
  @Override
  public void setDocumentLocator(Locator locator) {

    // No action for the moment.
  }

  /**
   * Receive notification of the beginning of a document.
   *
   * <p>
   * The SAX parser will invoke this method only once, before any other methods
   * in this interface or in DTDHandler (except for setDocumentLocator).
   * </p>
   */
  @Override
  public void startDocument() throws org.xml.sax.SAXException {

    // No action for the moment.
  }

  /**
   * Receive notification of the end of a document.
   *
   * <p>
   * The SAX parser will invoke this method only once, and it will be the last
   * method invoked during the parse. The parser shall not invoke this method
   * until it has either abandoned parsing (because of an unrecoverable error)
   * or reached the end of input.
   * </p>
   */
  @Override
  public void endDocument() throws org.xml.sax.SAXException {

    // No action for the moment.
  }

  /**
   * Receive notification of the beginning of an element.
   *
   * <p>
   * The Parser will invoke this method at the beginning of every element in the
   * XML document; there will be a corresponding endElement() event for every
   * startElement() event (even when the element is empty). All of the element's
   * content will be reported, in order, before the corresponding endElement()
   * event.
   * </p>
   *
   * <p>
   * If the element name has a namespace prefix, the prefix will still be
   * attached. Note that the attribute list provided will contain only
   * attributes with explicit values (specified or defaulted): #IMPLIED
   * attributes will be omitted.
   * </p>
   *
   * <p>
   * The new element is appended to the current container, receives every
   * attribute in {@code atts}, and then becomes the current node.
   * </p>
   *
   * @param ns
   *          The namespace of the node
   * @param localName
   *          The local part of the qualified name
   * @param name
   *          The element name.
   * @param atts
   *          The attributes attached to the element, if any.
   * @throws org.xml.sax.SAXException
   *           if the element cannot be appended, or wrapping any exception
   *           raised while copying the attributes
   * @see #endElement
   * @see org.xml.sax.Attributes
   */
  @Override
  public void startElement(String ns, String localName, String name,
      Attributes atts) throws org.xml.sax.SAXException {

    // Note that the namespace-aware call must be used to correctly
    // construct a Level 2 DOM, even for non-namespaced nodes.
    String elemNS = ((null == ns) || (ns.length() == 0)) ? null : ns;
    Element elem = m_doc.createElementNS(elemNS, name);

    append(elem);

    try {
      int nAtts = atts.getLength();

      for (int i = 0; i < nAtts; i++) {

        // First handle a possible ID attribute
        if (atts.getType(i).equalsIgnoreCase("ID"))
          setIDAttribute(atts.getValue(i), elem);

        String attrNS = atts.getURI(i);

        if ("".equals(attrNS))
          attrNS = null; // DOM represents no-namespace as null

        String attrQName = atts.getQName(i);

        // In SAX, xmlns and xmlns:* attributes have an empty namespace, while
        // in DOM they must be in the xmlns namespace. The bare "xmlns" form
        // has to be covered as well: setAttributeNS rejects the qualified
        // name "xmlns" with any other namespace (NAMESPACE_ERR).
        if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
          attrNS = XMLNS_URI;

        // ALWAYS use the DOM Level 2 call!
        elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
      }

      m_elemStack.push(elem);

      m_currentNode = elem;
    } catch (java.lang.Exception de) {
      throw new org.xml.sax.SAXException(de);
    }

  }

  /**
   * Receive notification of the end of an element.
   *
   * <p>
   * The SAX parser will invoke this method at the end of every element in the
   * XML document; there will be a corresponding startElement() event for every
   * endElement() event (even when the element is empty).
   * </p>
   *
   * <p>
   * If the element name has a namespace prefix, the prefix will still be
   * attached to the name.
   * </p>
   *
   * <p>
   * The innermost open element is popped; the current node becomes the element
   * now on top of the stack, or null when no element remains open.
   * </p>
   *
   * @param ns
   *          the namespace of the element
   * @param localName
   *          The local part of the qualified name of the element
   * @param name
   *          The element name
   */
  @Override
  public void endElement(String ns, String localName, String name)
      throws org.xml.sax.SAXException {
    m_elemStack.pop();
    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
  }

  /**
   * Set an ID string to node association in the ID table.
   *
   * @param id
   *          The ID string.
   * @param elem
   *          The associated element.
   */
  public void setIDAttribute(String id, Element elem) {

    // Do nothing. This method is meant to be overridden.
  }

  /**
   * Receive notification of character data.
   *
   * <p>
   * The Parser will call this method to report each chunk of character data.
   * SAX parsers may return all contiguous character data in a single chunk, or
   * they may split it into several chunks; however, all of the characters in
   * any single event must come from the same external entity, so that the
   * Locator provides useful information.
   * </p>
   *
   * <p>
   * The application must not attempt to read from the array outside of the
   * specified range.
   * </p>
   *
   * <p>
   * Note that some parsers will report whitespace using the
   * ignorableWhitespace() method rather than this one (validating parsers must
   * do so).
   * </p>
   *
   * <p>
   * Inside a CDATA section the data is routed to {@link #cdata}. Otherwise it
   * is appended to the current node's last child when that child is a text
   * node, and added as a new text node when it is not.
   * </p>
   *
   * @param ch
   *          The characters from the XML document.
   * @param start
   *          The start position in the array.
   * @param length
   *          The number of characters to read from the array.
   * @see #ignorableWhitespace
   * @see org.xml.sax.Locator
   */
  @Override
  public void characters(char ch[], int start, int length)
      throws org.xml.sax.SAXException {
    if (isOutsideDocElem()
        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
      return; // avoid DOM006 Hierarchy request error

    if (m_inCData) {
      cdata(ch, start, length);

      return;
    }

    String s = new String(ch, start, length);
    Node childNode;
    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
      ((Text) childNode).appendData(s);
    } else {
      Text text = m_doc.createTextNode(s);
      append(text);
    }
  }

  /**
   * If available, when the disable-output-escaping attribute is used, output
   * raw text without escaping. A PI will be inserted in front of the node with
   * the name "xslt-next-is-raw" and a value of "formatter-to-dom".
   *
   * @param ch
   *          Array containing the characters
   * @param start
   *          Index to start of characters in the array
   * @param length
   *          Number of characters in the array
   */
  public void charactersRaw(char ch[], int start, int length)
      throws org.xml.sax.SAXException {
    if (isOutsideDocElem()
        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
      return; // avoid DOM006 Hierarchy request error

    String s = new String(ch, start, length);

    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
        "formatter-to-dom"));
    append(m_doc.createTextNode(s));
  }

  /**
   * Report the beginning of an entity.
   *
   * The start and end of the document entity are not reported. The start and
   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
   * All other events must be properly nested within start/end entity events.
   *
   * @param name
   *          The name of the entity. If it is a parameter entity, the name will
   *          begin with '%'.
   * @see #endEntity
   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
   */
  @Override
  public void startEntity(String name) throws org.xml.sax.SAXException {

    // Almost certainly the wrong behavior...
    // entityReference(name);
  }

  /**
   * Report the end of an entity.
   *
   * @param name
   *          The name of the entity that is ending.
   * @see #startEntity
   */
  @Override
  public void endEntity(String name) throws org.xml.sax.SAXException {
  }

  /**
   * Receive notification of an entity reference.
   *
   * @param name
   *          name of the entity reference
   */
  public void entityReference(String name) throws org.xml.sax.SAXException {
    append(m_doc.createEntityReference(name));
  }

  /**
   * Receive notification of ignorable whitespace in element content.
   *
   * <p>
   * Validating Parsers must use this method to report each chunk of ignorable
   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
   * non-validating parsers may also use this method if they are capable of
   * parsing and using content models.
   * </p>
   *
   * <p>
   * SAX parsers may return all contiguous whitespace in a single chunk, or they
   * may split it into several chunks; however, all of the characters in any
   * single event must come from the same external entity, so that the Locator
   * provides useful information.
   * </p>
   *
   * <p>
   * The application must not attempt to read from the array outside of the
   * specified range.
   * </p>
   *
   * @param ch
   *          The characters from the XML document.
   * @param start
   *          The start position in the array.
   * @param length
   *          The number of characters to read from the array.
   * @see #characters
   */
  @Override
  public void ignorableWhitespace(char ch[], int start, int length)
      throws org.xml.sax.SAXException {
    if (isOutsideDocElem())
      return; // avoid DOM006 Hierarchy request error

    String s = new String(ch, start, length);

    append(m_doc.createTextNode(s));
  }

  /**
   * Tell if the current node is outside the document element.
   *
   * @return true if the current node is outside the document element.
   */
  private boolean isOutsideDocElem() {
    return (null == m_docFrag)
        && m_elemStack.size() == 0
        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
  }

  /**
   * Receive notification of a processing instruction.
   *
   * <p>
   * The Parser will invoke this method once for each processing instruction
   * found: note that processing instructions may occur before or after the main
   * document element.
   * </p>
   *
   * <p>
   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
   * or a text declaration (XML 1.0, section 4.3.1) using this method.
   * </p>
   *
   * @param target
   *          The processing instruction target.
   * @param data
   *          The processing instruction data, or null if none was supplied.
   */
  @Override
  public void processingInstruction(String target, String data)
      throws org.xml.sax.SAXException {
    append(m_doc.createProcessingInstruction(target, data));
  }

  /**
   * Report an XML comment anywhere in the document.
   *
   * This callback will be used for comments inside or outside the document
   * element, including comments in the external DTD subset (if read).
   *
   * <p>
   * Calls whose range does not lie inside {@code ch} are ignored rather than
   * reported as errors.
   * </p>
   *
   * @param ch
   *          An array holding the characters in the comment.
   * @param start
   *          The starting position in the array.
   * @param length
   *          The number of characters to use from the array.
   */
  @Override
  public void comment(char ch[], int start, int length)
      throws org.xml.sax.SAXException {
    // tagsoup sometimes submits invalid values here. A range that ends
    // exactly at the end of the array is valid and must not be dropped, so
    // the upper bound is checked with '>' rather than '>='.
    if (ch == null || start < 0 || length < 0 || length > (ch.length - start))
      return;
    append(m_doc.createComment(new String(ch, start, length)));
  }

  /**
   * Report the start of a CDATA section.
   *
   * <p>
   * An empty CDATA section node is appended; {@link #cdata} later adds the
   * character data to it.
   * </p>
   *
   * @see #endCDATA
   */
  @Override
  public void startCDATA() throws org.xml.sax.SAXException {
    m_inCData = true;
    append(m_doc.createCDATASection(""));
  }

  /**
   * Report the end of a CDATA section.
   *
   * @see #startCDATA
   */
  @Override
  public void endCDATA() throws org.xml.sax.SAXException {
    m_inCData = false;
  }

  /**
   * Receive notification of cdata.
   *
   * <p>
   * The Parser will call this method to report each chunk of character data.
   * SAX parsers may return all contiguous character data in a single chunk, or
   * they may split it into several chunks; however, all of the characters in
   * any single event must come from the same external entity, so that the
   * Locator provides useful information.
   * </p>
   *
   * <p>
   * The application must not attempt to read from the array outside of the
   * specified range.
   * </p>
   *
   * <p>
   * Note that some parsers will report whitespace using the
   * ignorableWhitespace() method rather than this one (validating parsers must
   * do so).
   * </p>
   *
   * <p>
   * The data is added to the last child of the container that
   * {@link #append(Node)} writes to, provided that child is a CDATA section or
   * a comment; otherwise the data is discarded.
   * </p>
   *
   * @param ch
   *          The characters from the XML document.
   * @param start
   *          The start position in the array.
   * @param length
   *          The number of characters to read from the array.
   * @see #ignorableWhitespace
   * @see org.xml.sax.Locator
   */
  public void cdata(char ch[], int start, int length)
      throws org.xml.sax.SAXException {
    if (isOutsideDocElem()
        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
      return; // avoid DOM006 Hierarchy request error

    String s = new String(ch, start, length);

    // startCDATA() appends its section through append(), which falls back to
    // the document fragment or the document when there is no current node.
    // Look for the section in that same container instead of dereferencing
    // m_currentNode unconditionally.
    Node target = appendTarget();
    if (null == target)
      return;

    // XXX [email protected]: modified from the original, to accommodate TagSoup.
    Node n = target.getLastChild();
    if (n instanceof CDATASection)
      ((CDATASection) n).appendData(s);
    else if (n instanceof Comment)
      ((Comment) n).appendData(s);
  }

  /**
   * Report the start of DTD declarations, if any.
   *
   * Any declarations are assumed to be in the internal subset unless otherwise
   * indicated.
   *
   * @param name
   *          The document type name.
   * @param publicId
   *          The declared public identifier for the external DTD subset, or
   *          null if none was declared.
   * @param systemId
   *          The declared system identifier for the external DTD subset, or
   *          null if none was declared.
   * @see #endDTD
   * @see #startEntity
   */
  @Override
  public void startDTD(String name, String publicId, String systemId)
      throws org.xml.sax.SAXException {

    // Do nothing for now.
  }

  /**
   * Report the end of DTD declarations.
   *
   * @see #startDTD
   */
  @Override
  public void endDTD() throws org.xml.sax.SAXException {

    // Do nothing for now.
  }

  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   *
   * <p>
   * The information from this event is not necessary for normal Namespace
   * processing: the SAX XML reader will automatically replace prefixes for
   * element and attribute names when the http://xml.org/sax/features/namespaces
   * feature is true (the default).
   * </p>
   *
   * <p>
   * There are cases, however, when applications need to use prefixes in
   * character data or in attribute values, where they cannot safely be expanded
   * automatically; the start/endPrefixMapping event supplies the information to
   * the application to expand prefixes in those contexts itself, if necessary.
   * </p>
   *
   * <p>
   * Note that start/endPrefixMapping events are not guaranteed to be properly
   * nested relative to each-other: all startPrefixMapping events will occur
   * before the corresponding startElement event, and all endPrefixMapping
   * events will occur after the corresponding endElement event, but their order
   * is not guaranteed.
   * </p>
   *
   * @param prefix
   *          The Namespace prefix being declared.
   * @param uri
   *          The Namespace URI the prefix is mapped to.
   * @see #endPrefixMapping
   * @see #startElement
   */
  @Override
  public void startPrefixMapping(String prefix, String uri)
      throws org.xml.sax.SAXException {

    /*
     * // Not sure if this is needed or wanted // Also, it fails in the stree.
     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
     * = "xmlns:"+prefix;
     * 
     * Element elem = (Element)m_currentNode; String val =
     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
     * uri); } }
     */
  }

  /**
   * End the scope of a prefix-URI mapping.
   *
   * <p>
   * See startPrefixMapping for details. This event will always occur after the
   * corresponding endElement event, but the order of endPrefixMapping events is
   * not otherwise guaranteed.
   * </p>
   *
   * @param prefix
   *          The prefix that was being mapped.
   * @see #startPrefixMapping
   * @see #endElement
   */
  @Override
  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
  }

  /**
   * Receive notification of a skipped entity.
   *
   * <p>
   * The Parser will invoke this method once for each entity skipped.
   * Non-validating processors may skip entities if they have not seen the
   * declarations (because, for example, the entity was declared in an external
   * DTD subset). All processors may skip external entities, depending on the
   * values of the http://xml.org/sax/features/external-general-entities and the
   * http://xml.org/sax/features/external-parameter-entities properties.
   * </p>
   *
   * @param name
   *          The name of the skipped entity. If it is a parameter entity, the
   *          name will begin with '%'.
   */
  @Override
  public void skippedEntity(String name) throws org.xml.sax.SAXException {
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java new file mode 100644 index 0000000..3c2aba0 --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -0,0 +1,400 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import java.net.URL; +import java.net.MalformedURLException; +import java.util.Collection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Stack; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.nutch.util.URLUtil; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.*; + +/** + * A collection of methods for extracting content from DOM trees. + * + * This class holds a few utility methods for pulling content out of DOM nodes, + * such as getOutlinks, getText, etc. 
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + if (!forceTags.contains(ignoreTags[i])) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * <code>StringBuffer</code>. 
+ * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted + * and the <code>StringBuffer</code> will not contain any text encountered + * after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + /** + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) + sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append the content text found beneath the first <code>title</code> node to + * the <code>StringBuffer</code>. 
+ * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + public URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) { + } + } + } + } + } + } + + // no. + return null; + } + + private boolean hasOnlyWhiteSpace(Node node) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0 = children.item(0); + Node c1 = children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * This method finds all anchors below the supplied DOM <code>node</code>, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> + * {@link ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). 
+ */ + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = (LinkParams) linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, currentNode, true); + if (linkText.toString().trim().length() == 0) { + // try harder - use img alt if present + NodeWalker subWalker = new NodeWalker(currentNode); + while (subWalker.hasNext()) { + Node subNode = subWalker.nextNode(); + if (subNode.getNodeType() == Node.ELEMENT_NODE) { + if (subNode.getNodeName().toLowerCase().equals("img")) { + NamedNodeMap subAttrs = subNode.getAttributes(); + Node alt = subAttrs.getNamedItem("alt"); + if (alt != null) { + String altTxt = alt.getTextContent(); + if (altTxt != null && altTxt.trim().length() > 0) { + if (linkText.length() > 0) + linkText.append(' '); + linkText.append(altTxt); + } + } + } else { + // ignore other types of elements + + } + } else if (subNode.getNodeType() == Node.TEXT_NODE) { + String txt = subNode.getTextContent(); + if (txt != null && txt.length() > 0) { + if (linkText.length() > 0) + linkText.append(' '); + linkText.append(txt); + } + } + } + } + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if 
("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = URLUtil.resolveURL(base, target); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) + continue; + } + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java new file mode 100644 index 0000000..159aa76 --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java @@ -0,0 +1,214 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.html; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. + */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in <code>robotsMeta</code> to appropriate values, based + * on any META tags found under the given <code>node</code>. + */ + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName().toLowerCase(); + if (attrName.equals("name")) { + nameNode = attr; + } else if (attrName.equals("http-equiv")) { + equivNode = attr; + } else if (attrName.equals("content")) { + contentNode = attr; + } + } + + if (nameNode != null) { + if (contentNode != null) { + String name = nameNode.getNodeValue().toLowerCase(); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); + if ("robots".equals(name)) { + + if (contentNode != null) { + String directives = 
contentNode.getNodeValue().toLowerCase(); + int index = directives.indexOf("none"); + + if (index >= 0) { + metaTags.setNoIndex(); + metaTags.setNoFollow(); + } + + index = directives.indexOf("all"); + if (index >= 0) { + // do nothing... + } + + index = directives.indexOf("noindex"); + if (index >= 0) { + metaTags.setNoIndex(); + } + + index = directives.indexOf("nofollow"); + if (index >= 0) { + metaTags.setNoFollow(); + } + + index = directives.indexOf("noarchive"); + if (index >= 0) { + metaTags.setNoCache(); + } + } + + } // end if (name == robots) + } + } + + if (equivNode != null) { + if (contentNode != null) { + String name = equivNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue(); + metaTags.getHttpEquivTags().setProperty(name, content); + if ("pragma".equals(name)) { + content = content.toLowerCase(); + int index = content.indexOf("no-cache"); + if (index >= 0) + metaTags.setNoCache(); + } else if ("refresh".equals(name)) { + int idx = content.indexOf(';'); + String time = null; + if (idx == -1) { // just the refresh time + time = content; + } else + time = content.substring(0, idx); + try { + metaTags.setRefreshTime(Integer.parseInt(time)); + // skip this if we couldn't parse the time + metaTags.setRefresh(true); + } catch (Exception e) { + ; + } + URL refreshUrl = null; + if (metaTags.getRefresh() && idx != -1) { // set the URL + idx = content.toLowerCase().indexOf("url="); + if (idx == -1) { // assume a mis-formatted entry with just the + // url + idx = content.indexOf(';') + 1; + } else + idx += 4; + if (idx != -1) { + String url = content.substring(idx); + try { + refreshUrl = new URL(url); + } catch (Exception e) { + // XXX according to the spec, this has to be an absolute + // XXX url. However, many websites use relative URLs and + // XXX expect browsers to handle that. + // XXX Unfortunately, in some cases this may create a + // XXX infinitely recursive paths (a crawler trap)... 
+ // if (!url.startsWith("/")) url = "/" + url; + try { + refreshUrl = new URL(currURL, url); + } catch (Exception e1) { + refreshUrl = null; + } + } + } + } + if (metaTags.getRefresh()) { + if (refreshUrl == null) { + // apparently only refresh time was present. set the URL + // to the same URL. + refreshUrl = currURL; + } + metaTags.setRefreshHref(refreshUrl); + } + } + } + } + + } else if ("base".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + + if (hrefNode != null) { + String urlString = hrefNode.getNodeValue(); + + URL url = null; + try { + if (currURL == null) + url = new URL(urlString); + else + url = new URL(currURL, urlString); + } catch (Exception e) { + ; + } + + if (url != null) + metaTags.setBaseHref(url); + } + + } + + } + + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + getMetaTagsHelper(metaTags, children.item(i), currURL); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java new file mode 100644 index 0000000..4d043ba --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java @@ -0,0 +1,352 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.util.ArrayList;
import java.util.Map;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.io.*;
import java.util.regex.*;

import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;

/**
 * HTML parser plugin: builds a DOM with NekoHTML (default) or TagSoup,
 * extracts text, title, outlinks and META directives from it, and runs the
 * configured HTML parse filters on the result.
 */
public class HtmlParser implements Parser {
  public static final Logger LOG = LoggerFactory
      .getLogger("org.apache.nutch.parse.html");

  // I used 1000 bytes at first, but found that some documents have
  // meta tag well past the first 1000 bytes.
  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
  private static final int CHUNK_SIZE = 8192;

  // NUTCH-1006 Meta equiv with single quotes not accepted
  private static final Pattern metaPattern = Pattern.compile(
      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
      Pattern.CASE_INSENSITIVE);
  private static final Pattern charsetPattern = Pattern.compile(
      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
  private static final Pattern charsetPatternHTML5 = Pattern.compile(
      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
      Pattern.CASE_INSENSITIVE);

  private String parserImpl;

  /**
   * Given a <code>byte[]</code> representing an html file of an
   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
   * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
   * Content-Type or no charset is specified, the content is checked for a
   * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
   * character encodings (UTF-16 only). If no character set can be determined,
   * <code>null</code> is returned. <br />
   * See also
   * http://www.w3.org/International/questions/qa-html-encoding-declarations,
   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
   * http://www.w3.org/TR/REC-xml/#sec-guessing
   *
   * @param content
   *          <code>byte[]</code> representation of an html file
   * @return the sniffed charset name, or <code>null</code> if none was found
   */
  private static String sniffCharacterEncoding(byte[] content) {
    int length = Math.min(content.length, CHUNK_SIZE);

    // Only the ASCII parts matter for locating the meta tag, so the chunk is
    // decoded as US-ASCII; non-ASCII bytes decode to the replacement character
    // and cannot match the patterns.
    String str = new String(content, 0, length, StandardCharsets.US_ASCII);

    String encoding = null;
    Matcher metaMatcher = metaPattern.matcher(str);
    if (metaMatcher.find()) {
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find()) {
        encoding = charsetMatcher.group(1);
      }
    }
    if (encoding == null) {
      // check for HTML5 meta charset
      metaMatcher = charsetPatternHTML5.matcher(str);
      if (metaMatcher.find()) {
        encoding = metaMatcher.group(1);
      }
    }
    if (encoding == null) {
      // check for BOM
      if (content.length >= 3 && content[0] == (byte) 0xEF
          && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
        encoding = "UTF-8";
      } else if (content.length >= 2) {
        if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
          encoding = "UTF-16LE";
        } else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
          encoding = "UTF-16BE";
        }
      }
    }

    return encoding;
  }

  private String defaultCharEncoding;

  private Configuration conf;

  private DOMContentUtils utils;

  private HtmlParseFilters htmlParseFilters;

  private String cachingPolicy;

  /**
   * Parses the given content as HTML.
   *
   * @param content
   *          the fetched content to parse
   * @return the filtered parse result, or an empty parse result carrying the
   *         failure status when the base URL is malformed or parsing fails
   */
  public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return failure(content, e);
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(
          contentInOctets));

      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
      String encoding = detector.guessEncoding(content, defaultCharEncoding);

      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

      input.setEncoding(encoding);
      LOG.trace("Parsing...");
      root = parse(input);
    } catch (IOException | DOMException | SAXException e) {
      // expected parse failures: reported through the status only
      return failure(content, e);
    } catch (Exception e) {
      LOG.error("Error: ", e);
      return failure(content, e);
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);

    // populate Nutch metadata with HTML meta directives
    metadata.addAll(metaTags.getGeneralTags());

    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      LOG.trace("Getting text...");
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      LOG.trace("Getting title...");
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      LOG.trace("Getting links...");
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks,
        content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
        new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
        entry.getValue().getData().getParseMeta()
            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
      }
    }
    return filteredParse;
  }

  /** Builds the empty parse result that reports <code>cause</code> for this content. */
  private ParseResult failure(Content content, Throwable cause) {
    return new ParseStatus(cause).getEmptyParseResult(content.getUrl(),
        getConf());
  }

  /** Parses with TagSoup when <code>parser.html.impl</code> is "tagsoup", otherwise with NekoHTML. */
  private DocumentFragment parse(InputSource input) throws Exception {
    if (parserImpl.equalsIgnoreCase("tagsoup")) {
      return parseTagSoup(input);
    }
    return parseNeko(input);
  }

  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    DocumentFragment frag = doc.createDocumentFragment();
    DOMBuilder builder = new DOMBuilder(doc, frag);
    org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
    reader.setContentHandler(builder);
    reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
    reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
    reader
        .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
    reader.parse(input);
    return frag;
  }

  private DocumentFragment parseNeko(InputSource input) throws Exception {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser.setFeature(
          "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
          true);
      parser.setFeature("http://cyberneko.org/html/features/augmentations",
          true);
      parser.setProperty(
          "http://cyberneko.org/html/properties/default-encoding",
          defaultCharEncoding);
      parser.setFeature(
          "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
          true);
      parser.setFeature(
          "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
          false);
      parser.setFeature(
          "http://cyberneko.org/html/features/balance-tags/document-fragment",
          true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors",
          LOG.isTraceEnabled());
    } catch (SAXException e) {
      // Best effort: parsing proceeds with whatever settings were applied,
      // but the failure is no longer invisible.
      LOG.debug("Could not fully configure the NekoHTML parser", e);
    }
    // convert Document to DocumentFragment
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment res = doc.createDocumentFragment();
    DocumentFragment frag = doc.createDocumentFragment();
    parser.parse(input, frag);
    res.appendChild(frag);

    // keep parsing the same input until a pass yields no further nodes
    try {
      while (true) {
        frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        if (!frag.hasChildNodes()) {
          break;
        }
        if (LOG.isInfoEnabled()) {
          LOG.info(" - new frag, " + frag.getChildNodes().getLength()
              + " nodes.");
        }
        res.appendChild(frag);
      }
    } catch (Exception e) {
      LOG.error("Error: ", e);
    }
    return res;
  }

  /**
   * Command-line entry point: parses the local HTML file named by
   * <code>args[0]</code> and prints the parse data and text.
   */
  public static void main(String[] args) throws Exception {
    // LOG.setLevel(Level.FINE);
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources: the stream was previously never closed
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
      in.readFully(bytes);
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
        new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
        url);
    System.out.println("data: " + parse.getData());

    System.out.println("text: " + parse.getText());

  }

  /** Stores the configuration and (re)initializes every setting and helper derived from it. */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.htmlParseFilters = new HtmlParseFilters(getConf());
    this.parserImpl = getConf().get("parser.html.impl", "neko");
    this.defaultCharEncoding = getConf().get(
        "parser.character.encoding.default", "windows-1252");
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
        Nutch.CACHING_FORBIDDEN_CONTENT);
  }

  public Configuration getConf() {
    return this.conf;
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java new file mode 100644 index 0000000..eb382e8 --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java @@ -0,0 +1,112 @@ +/* + * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer, + * XXX in order to avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id$ + */ +package org.apache.nutch.parse.html; + +/** + * Static checks for whether a character, or every character of a sequence, + * conforms to the XML 1.0 definition of whitespace. + */ +public class XMLCharacterRecognizer { + + /** + * Returns whether the specified <var>ch</var> conforms to the XML 1.0 + * definition of whitespace, i.e. is a space (0x20), tab (0x09), carriage + * return (0xD) or line feed (0xA). Refer to <A + * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of + * <CODE>S</CODE></A> for details. + * + * @param ch + * Character to check as XML whitespace. + * @return true if <var>ch</var> is XML whitespace; otherwise false. + */ + public static boolean isWhiteSpace(char ch) { + return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA); + } + + /** + * Tell if a range of a character array consists only of XML whitespace. An + * empty range (<var>length</var> of zero or less) yields true. + * + * @param ch + * Character array to check as XML whitespace. + * @param start + * Index of the first character to check + * @param length + * Number of characters to check, beginning at <var>start</var> + * @return True if every character in the range is XML whitespace; otherwise, + * false. + */ + public static boolean isWhiteSpace(char ch[], int start, int length) { + + int end = start + length; + + for (int s = start; s < end; s++) { + if (!isWhiteSpace(ch[s])) + return false; + } + + return true; + } + + /** + * Tell if the buffer contains only XML whitespace. An empty buffer yields + * true. + * + * @param buf + * StringBuffer to check as XML whitespace; it is dereferenced + * without a null check, so it must not be null.
+ * @return True if every character in the buffer is XML whitespace, false + * otherwise + */ + public static boolean isWhiteSpace(StringBuffer buf) { + + int n = buf.length(); + + for (int i = 0; i < n; i++) { + if (!isWhiteSpace(buf.charAt(i))) + return false; + } + + return true; + } + + /** + * Tell if the string contains only XML whitespace. A null or empty string + * yields true. + * + * @param s + * String to check as XML whitespace; may be null. + * @return True if <var>s</var> is null, empty, or made up solely of XML + * whitespace; false otherwise + */ + public static boolean isWhiteSpace(String s) { + + if (null != s) { + int n = s.length(); + + for (int i = 0; i < n; i++) { + if (!isWhiteSpace(s.charAt(i))) + return false; + } + } + + return true; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html new file mode 100644 index 0000000..c650389 --- /dev/null +++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>An HTML document parsing plugin.</p><p>This package relies on <a href="http://www.apache.org/~andyc/neko/doc/html/index.html">NekoHTML</a>.</p> +</body> +</html>
