[
https://issues.apache.org/jira/browse/NUTCH-2248?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15235504#comment-15235504
]
ASF GitHub Bot commented on NUTCH-2248:
---------------------------------------
Github user lewismc commented on a diff in the pull request:
https://github.com/apache/nutch/pull/102#discussion_r59242509
--- Diff:
src/plugin/parse-css/src/java/org/apache/nutch/parse/css/CssParser.java ---
@@ -0,0 +1,225 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.css;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.steadystate.css.parser.CSSOMParser;
+import com.steadystate.css.parser.SACParserCSS3;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.css.sac.CSSParseException;
+import org.w3c.css.sac.InputSource;
+import org.w3c.dom.css.*;
+
+
+public class CssParser implements org.apache.nutch.parse.Parser {
+ public static final Logger LOG =
LoggerFactory.getLogger(CssParser.class);
+
+ /**
+ * SAC error handler that this class installs on the CSS parser it creates.
+ *
+ * Suppresses warnings, logs all errors, and throws fatal errors.
+ * Errors are logged at DEBUG level using only the exception message;
+ * fatal errors are logged the same way and then rethrown unchanged.
+ */
+ private class ErrorHandler implements org.w3c.css.sac.ErrorHandler {
+ @Override
+ public void warning(CSSParseException exception) { } // warnings are deliberately ignored
+
+ @Override
+ public void error(CSSParseException exception) {
+ LOG.debug("CSS parser error: " + exception.getMessage()); // log only; do not throw
+ }
+
+ @Override
+ public void fatalError(CSSParseException exception) {
+ error(exception); // reuse error() so fatal errors are logged identically
+ throw exception; // rethrow the same exception instance
+ }
+ }
+
+ private Configuration config;
+
+ /** Parses CSS stylesheets for outlinks.
+ *
+ * Extracts :
+ *
+ * - \\@import url(...) ...;
+ * - \\@import '...';
+ * - \\@font-face src: <uri>
+ * - and all non-custom style property values matching url(...)
+ *
+ * Ignores:
+ *
+ * - \\@namespace <uri>
+ * - \\@document <url>
+ *
+ * @param content CSS resource content to be parsed
+ * @return result of parse
+ */
+ @Override
+ public ParseResult getParse(Content content) {
+ CSSOMParser parser;
+ parser = new CSSOMParser(new SACParserCSS3());
+ parser.setErrorHandler(new ErrorHandler());
+
+ URL base;
+ try {
+ base = new URL(content.getBaseUrl());
+ } catch (MalformedURLException e) {
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
+ }
+
+ ByteArrayInputStream bis = new
ByteArrayInputStream(content.getContent());
+ Reader reader = new InputStreamReader(bis, StandardCharsets.UTF_8);
+ InputSource source = new InputSource(reader);
+ CSSStyleSheet sheet;
+ try {
+ sheet = parser.parseStyleSheet(source, null, content.getBaseUrl());
+ } catch (IOException e) {
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
+ } finally {
+ try {
+ reader.close();
+ } catch (IOException e) {
+ LOG.warn("failed to close reader");
+ }
+ }
+
+ CSSRuleList rules = sheet.getCssRules();
+ List<String> urls = new ArrayList<String>();
+ for (int i = 0; i < rules.getLength(); i++) {
+ CSSRule rule = rules.item(i);
+ switch (rule.getType()) {
+ // @import
+ case CSSRule.IMPORT_RULE:
+ urls.add(((CSSImportRule) rule).getHref());
+ break;
+
+ // @font-face
+ case CSSRule.FONT_FACE_RULE:
+
collectStyleDeclarationOutlinks(((CSSFontFaceRule)rule).getStyle(), urls);
+ break;
+
+ // normal CSS style rule
+ case CSSRule.STYLE_RULE:
+ collectStyleDeclarationOutlinks(((CSSStyleRule)rule).getStyle(),
urls);
+ break;
+
+ // ignore @charset, @media, @page and unknown at-rules
+ default:
+ break;
+ }
+ }
+
+ // resolve each relative URL to create a list of Outlinks
+ List<Outlink> outlinks = new ArrayList<Outlink>();
+ for (int i = 0; i < urls.size(); i++) {
+ String rawUrl = urls.get(i);
+ try {
+ URL url = URLUtil.resolveURL(base, rawUrl);
+ outlinks.add(new Outlink(url.toString(), ""));
+ } catch (MalformedURLException e) {
+ LOG.debug("failed to resolve url (base: " + base + ", path: " +
rawUrl + ")");
--- End diff --
Can you use parameterized logging here instead of string concatenation?
```LOG.debug("failed to resolve url (base: {}, path: {})", base, rawUrl);```
> CSS parser plugin
> -----------------
>
> Key: NUTCH-2248
> URL: https://issues.apache.org/jira/browse/NUTCH-2248
> Project: Nutch
> Issue Type: New Feature
> Components: parser, plugin
> Affects Versions: 1.12
> Reporter: Joseph Naegele
>
> This plugin allows for collecting {{uri}} links from CSS (stylesheets). This
> is useful for collecting parent stylesheets, fonts, and images needed to
> display web pages as intended.
> Parsed Outlinks do not have associated anchors, and no additional
> text/content is parsed from the stylesheet.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)