Repository: nutch Updated Branches: refs/heads/master dce7a28c7 -> 6b8586a47
NUTCH-2248 CSS Parser plugin parse-css Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/60e5a0e6 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/60e5a0e6 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/60e5a0e6 Branch: refs/heads/master Commit: 60e5a0e626c50244588db2b097490c816ed1aa3e Parents: b62f43f Author: Joseph Naegele <[email protected]> Authored: Thu Apr 7 21:40:04 2016 +0000 Committer: Joseph Naegele <[email protected]> Committed: Mon Apr 11 19:38:06 2016 +0000 ---------------------------------------------------------------------- conf/parse-plugins.xml | 6 + src/plugin/build.xml | 3 + src/plugin/parse-css/build.xml | 27 +++ src/plugin/parse-css/ivy.xml | 42 ++++ src/plugin/parse-css/plugin.xml | 47 ++++ .../org/apache/nutch/parse/css/CssParser.java | 225 +++++++++++++++++++ .../apache/nutch/parse/css/TestCssParser.java | 121 ++++++++++ 7 files changed, 471 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/conf/parse-plugins.xml ---------------------------------------------------------------------- diff --git a/conf/parse-plugins.xml b/conf/parse-plugins.xml index 20c8724..bddd8a1 100644 --- a/conf/parse-plugins.xml +++ b/conf/parse-plugins.xml @@ -55,6 +55,10 @@ <plugin id="parse-zip" /> </mimeType> + <mimeType name="text/css"> + <plugin id="parse-css" /> + </mimeType> + <mimeType name="text/html"> <plugin id="parse-html" /> </mimeType> @@ -84,6 +88,8 @@ <alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.TikaParser" /> <alias name="parse-ext" extension-id="ExtParser" /> + <alias name="parse-css" + extension-id="org.apache.nutch.parse.css.CssParser" /> <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> <alias name="parse-js" extension-id="JSParser" /> http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 10731b3..c448b62 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -58,6 +58,7 @@ <ant dir="protocol-interactiveselenium" target="deploy" /> <ant dir="parse-ext" target="deploy"/> <ant dir="parse-js" target="deploy"/> + <ant dir="parse-css" target="deploy"/> <ant dir="parse-html" target="deploy"/> <ant dir="parse-metatags" target="deploy"/> <ant dir="parse-swf" target="deploy"/> @@ -111,6 +112,7 @@ <ant dir="protocol-httpclient" target="test"/> <!--ant dir="parse-ext" target="test"/--> <ant dir="feed" target="test"/> + <ant dir="parse-css" target="test"/> <ant dir="parse-html" target="test"/> <ant dir="parse-metatags" target="test"/> <ant dir="parse-swf" target="test"/> @@ -175,6 +177,7 @@ <ant dir="protocol-interactiveselenium" target="clean" /> <ant dir="parse-ext" target="clean"/> <ant dir="parse-js" target="clean"/> + <ant dir="parse-css" target="clean"/> <ant dir="parse-html" target="clean"/> <ant dir="parse-metatags" target="clean"/> <ant dir="parse-swf" target="clean"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/parse-css/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-css/build.xml b/src/plugin/parse-css/build.xml new file mode 100644 index 0000000..f1903fb --- /dev/null +++ b/src/plugin/parse-css/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-css" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/parse-css/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-css/ivy.xml b/src/plugin/parse-css/ivy.xml new file mode 100644 index 0000000..da0d90e --- /dev/null +++ b/src/plugin/parse-css/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="net.sourceforge.cssparser" name="cssparser" rev="0.9.18"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/parse-css/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-css/plugin.xml b/src/plugin/parse-css/plugin.xml new file mode 100644 index 0000000..46f274d --- /dev/null +++ b/src/plugin/parse-css/plugin.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-css" + name="CSS Parser" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-css.jar"> + <export name="*"/> + </library> + <library name="cssparser-0.9.18.jar"/> + <library name="sac-1.3.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.parse.css" + name="CssParser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.css.CssParser" + class="org.apache.nutch.parse.css.CssParser"> + <parameter name="contentType" value="text/css"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/parse-css/src/java/org/apache/nutch/parse/css/CssParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-css/src/java/org/apache/nutch/parse/css/CssParser.java b/src/plugin/parse-css/src/java/org/apache/nutch/parse/css/CssParser.java new file mode 100644 index 0000000..21f1df5 --- /dev/null +++ b/src/plugin/parse-css/src/java/org/apache/nutch/parse/css/CssParser.java @@ -0,0 +1,225 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.css; + +import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import com.steadystate.css.parser.CSSOMParser; +import com.steadystate.css.parser.SACParserCSS3; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.parse.*; +import org.apache.nutch.util.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.css.sac.CSSParseException; +import org.w3c.css.sac.InputSource; +import org.w3c.dom.css.*; + + +public class CssParser implements org.apache.nutch.parse.Parser { + public static final Logger LOG = LoggerFactory.getLogger(CssParser.class); + + /** + * Suppresses warnings, logs all errors, and throws fatal errors + */ + private class ErrorHandler implements org.w3c.css.sac.ErrorHandler { + @Override + public void warning(CSSParseException exception) { } + + @Override + public void error(CSSParseException exception) { + LOG.debug("CSS parser error: {}", exception.getMessage()); + } + + @Override + public void fatalError(CSSParseException exception) { + error(exception); + throw exception; + } + } + + private Configuration config; + + /** Parses CSS stylesheets for outlinks. + * + * Extracts : + * + * - \\@import url(...) ...; + * - \\@import '...'; + * - \\@font-face src: <uri> + * - and all non-custom style property values matching url(...) + * + * Ignores: + * + * - \\@namespace <uri> + * - \\@document <url> + * + * @param content CSS resource content to be parsed + * @return result of parse + */ + @Override + public ParseResult getParse(Content content) { + CSSOMParser parser; + parser = new CSSOMParser(new SACParserCSS3()); + parser.setErrorHandler(new ErrorHandler()); + + URL base; + try { + base = new URL(content.getBaseUrl()); + } catch (MalformedURLException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + ByteArrayInputStream bis = new ByteArrayInputStream(content.getContent()); + Reader reader = new InputStreamReader(bis, StandardCharsets.UTF_8); + InputSource source = new InputSource(reader); + CSSStyleSheet sheet; + try { + sheet = parser.parseStyleSheet(source, null, content.getBaseUrl()); + } catch (IOException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } finally { + try { + reader.close(); + } catch (IOException e) { + LOG.warn("failed to close reader"); + } + } + + CSSRuleList rules = sheet.getCssRules(); + List<String> urls = new ArrayList<String>(); + for (int i = 0; i < rules.getLength(); i++) { + CSSRule rule = rules.item(i); + switch (rule.getType()) { + // @import + case CSSRule.IMPORT_RULE: + urls.add(((CSSImportRule) rule).getHref()); + break; + + // @font-face + case CSSRule.FONT_FACE_RULE: + collectStyleDeclarationOutlinks(((CSSFontFaceRule)rule).getStyle(), urls); + break; + + // normal CSS style rule + case CSSRule.STYLE_RULE: + collectStyleDeclarationOutlinks(((CSSStyleRule)rule).getStyle(), urls); + break; + + // ignore @charset, @media, @page and unknown at-rules + default: + break; + } + } + + // resolve each relative URL to create a list of Outlinks + List<Outlink> outlinks = new ArrayList<Outlink>(); + for (int i = 0; i < urls.size(); i++) { + String rawUrl = urls.get(i); + try { + URL url = URLUtil.resolveURL(base, rawUrl); + outlinks.add(new Outlink(url.toString(), "")); + } catch (MalformedURLException e) { + LOG.debug("failed to resolve url (base: {}, path: {})", base, rawUrl); + } + } + + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", + outlinks.toArray(new Outlink[0]), content.getMetadata(), new Metadata()); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl("", parseData)); + } + + private void collectStyleDeclarationOutlinks(CSSStyleDeclaration style, List<String> urls) { + for (int i = 0; i < style.getLength(); i++) { + String property = style.item(i); + CSSValue value = style.getPropertyCSSValue(property); + switch (value.getCssValueType()) { + case CSSValue.CSS_PRIMITIVE_VALUE: + collectPropertyPrimitiveValueOutlinks((CSSPrimitiveValue)value, urls); + break; + case CSSValue.CSS_VALUE_LIST: + collectPropertyValueListOutlinks((CSSValueList)value, urls); + break; + default: break; + } + } + } + + private void collectPropertyPrimitiveValueOutlinks(CSSPrimitiveValue value, List<String> urls) { + if (value.getPrimitiveType() == CSSPrimitiveValue.CSS_URI) { + String uri = value.getStringValue(); + // ignore "data" URIs (http://tools.ietf.org/html/rfc2397) + if (!uri.startsWith("data:")) { + urls.add(uri); + } + } + } + + private void collectPropertyValueListOutlinks(CSSValueList values, List<String> urls) { + for (int i = 0; i < values.getLength(); i++) { + CSSValue value = values.item(i); + + switch (value.getCssValueType()) { + case CSSValue.CSS_PRIMITIVE_VALUE: + collectPropertyPrimitiveValueOutlinks((CSSPrimitiveValue)value, urls); + break; + case CSSValue.CSS_VALUE_LIST: + // ignore nested value lists + break; + default: break; + } + } + + } + + @Override + public Configuration getConf() { + return this.config; + } + + @Override + public void setConf(Configuration conf) { + this.config = conf; + } + + public static void main(String[] args) throws Exception { + String name = args[0]; + String url = "file:" + name; + File file = new File(name); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + Configuration conf = NutchConfiguration.create(); + CssParser parser = new CssParser(); + parser.setConf(conf); + Parse parse = parser.getParse( + new Content(url, url, bytes, "text/css", new Metadata(), conf)).get( + url); + System.out.println("data: " + parse.getData()); + System.out.println("text: " + parse.getText()); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/60e5a0e6/src/plugin/parse-css/src/test/org/apache/nutch/parse/css/TestCssParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-css/src/test/org/apache/nutch/parse/css/TestCssParser.java b/src/plugin/parse-css/src/test/org/apache/nutch/parse/css/TestCssParser.java new file mode 100644 index 0000000..4875f1b --- /dev/null +++ b/src/plugin/parse-css/src/test/org/apache/nutch/parse/css/TestCssParser.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.css; + +import java.net.MalformedURLException; +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.parse.*; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestCssParser { + public static final Logger LOG = LoggerFactory.getLogger(TestCssParser.class); + + private static final String fullCss = + "@import url(\"fineprint.css\") print;\n" + + "@import url(\"bluish.css\") projection, tv;\n" + + "@import 'custom.css';\n" + + "@import url(\"chrome://communicator/skin/\");\n" + + "@import \"common.css\" screen, projection;\n" + + "@import url('landscape.css') screen and (orientation:landscape);\n" + + "@font-face {\n" + + " font-family: \"Bitstream Vera Serif Bold\";\n" + + " src: url(\"https://mdn.mozillademos.org/files/2468/VeraSeBd.ttf\");\n" + + "}\n" + + "@font-face {\n" + + " font-family: MyHelvetica;\n" + + " src: local(\"Helvetica Neue Bold\"),\n" + + " local(\"HelveticaNeue-Bold\"),\n" + + " url(MgOpenModernaBold.ttf);\n" + + " font-weight: bold;\n" + + "}\n" + + "@font-face {\n" + + " font-family: MyHelvetica;\n" + + " src: url('Fixedys.ttf'),\n" + + " url(\"FixedysOpen.ttf\"),\n" + + " url(OpenFixedys.ttf);\n" + + " font-weight: bold;\n" + + "}\n" + + ".topbanner { background: url(\"topbanner.png\") #00D no-repeat fixed; }\n" + + ".footer { background: url(./resources/footer.png); }\n" + + "ul { list-style: square url(http://www.example.com/redball.png) }\n" + + "div#header {\n" + + " background-image: url('images/header-background.jpg');\n" + + "}\n" + + "html {\n" + + " cursor: url(../cursors/cursor1.cur) 2 2, url(../images/cursor2.png),\n" + + " url('../images/cursor3.gif') 4 12,default;\n" + + "}\n" + + "li {\n" + + " background:\n" + + " url(data:image/gif;base64,FEDCBA9876543210)\n" + + " no-repeat\n" + + " left center;\n" + + " padding: 5px 0 5px 25px;\n" + + "}"; + + private Configuration conf; + private Parser parser; + + public TestCssParser() { + conf = NutchConfiguration.create(); + parser = new CssParser(); + parser.setConf(conf); + } + + @Test + public void testLinkExtraction() throws MalformedURLException { + byte[] contentBytes = fullCss.getBytes(StandardCharsets.UTF_8); + String dummyBase = "http://dummy.url/"; + String dummyUrl = dummyBase + "style/dummy.css"; + Content content = new Content(dummyUrl, dummyUrl, contentBytes, "text/css", + new Metadata(), conf); + Parse parse = parser.getParse(content).get(dummyUrl); + Outlink[] parsedOutlinks = parse.getData().getOutlinks(); + + String anchor = ""; + Outlink[] expectedOutlinks = { + new Outlink(dummyBase + "style/fineprint.css", anchor), + new Outlink(dummyBase + "style/bluish.css", anchor), + new Outlink(dummyBase + "style/custom.css", anchor), + new Outlink(dummyBase + "style/common.css", anchor), + new Outlink(dummyBase + "style/landscape.css", anchor), + new Outlink("https://mdn.mozillademos.org/files/2468/VeraSeBd.ttf", anchor), + new Outlink(dummyBase + "style/MgOpenModernaBold.ttf", anchor), + new Outlink(dummyBase + "style/Fixedys.ttf", anchor), + new Outlink(dummyBase + "style/FixedysOpen.ttf", anchor), + new Outlink(dummyBase + "style/OpenFixedys.ttf", anchor), + new Outlink(dummyBase + "style/topbanner.png", anchor), + new Outlink(dummyBase + "style/resources/footer.png", anchor), + new Outlink("http://www.example.com/redball.png", anchor), + new Outlink(dummyBase + "style/images/header-background.jpg", anchor), + new Outlink(dummyBase + "cursors/cursor1.cur", anchor), + new Outlink(dummyBase + "images/cursor2.png", anchor), + new Outlink(dummyBase + "images/cursor3.gif", anchor) + }; + + Assert.assertArrayEquals("Parsed Outlinks do not match expected", expectedOutlinks, parsedOutlinks); + } +}
