CCParseFilter.java

lewismc Thu, 20 Jun 2013 13:17:02 -0700

Author: lewismc
Date: Thu Jun 20 20:16:10 2013
New Revision: 1495159

URL: http://svn.apache.org/r1495159
Log:
set addition of CCParseFilter license headers to debug


Modified:
    
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java

Modified: 
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1495159&r1=1495158&r2=1495159&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
 Thu Jun 20 20:16:10 2013
@@ -50,282 +50,264 @@ import org.xml.sax.InputSource;
 
 /** Adds metadata identifying the Creative Commons license used, if any. */
 public class CCParseFilter implements ParseFilter {
-       public static final Logger LOG = 
LoggerFactory.getLogger(CCParseFilter.class);
+  public static final Logger LOG = 
LoggerFactory.getLogger(CCParseFilter.class);
 
-       /** Walks DOM tree, looking for RDF in comments and licenses in 
anchors. */
-       public static class Walker {
-               private URL base; // base url of page
-               private String rdfLicense; // subject url found, if any
-               private URL relLicense; // license url found, if any
-               private URL anchorLicense; // anchor url found, if any
-               private String workType; // work type URI
-
-               private Walker(URL base) {
-                       this.base = base;
-               }
-
-               /** Scan the document adding attributes to metadata. */
-               public static void walk(Node doc, URL base, WebPage page,
-                               Configuration conf) throws ParseException {
-
-                       // walk the DOM tree, scanning for license data
-                       Walker walker = new Walker(base);
-                       walker.walk(doc);
-
-                       // interpret results of walk
-                       String licenseUrl = null;
-                       String licenseLocation = null;
-                       if (walker.rdfLicense != null) { // 1st choice: subject 
in RDF
-                               licenseLocation = "rdf";
-                               licenseUrl = walker.rdfLicense;
-                       } else if (walker.relLicense != null) { // 2nd: anchor 
w/
-                                                                               
                        // rel=license
-                               licenseLocation = "rel";
-                               licenseUrl = walker.relLicense.toString();
-                       } else if (walker.anchorLicense != null) { // 3rd: 
anchor w/ CC
-                                                                               
                                // license
-                               licenseLocation = "a";
-                               licenseUrl = walker.anchorLicense.toString();
-                       } else if 
(conf.getBoolean("creativecommons.exclude.unlicensed",
-                                       false)) {
-                               throw new ParseException("No CC license.  
Excluding.");
-                       }
-
-                       // add license to metadata
-                       if (licenseUrl != null) {
-                               if (LOG.isInfoEnabled()) {
-                                       LOG.info("CC: found " + licenseUrl + " 
in "
-                                                       + licenseLocation + " 
of " + base);
-                               }
-                               page.putToMetadata(new 
Utf8(CreativeCommons.LICENSE_URL),
-                                               
ByteBuffer.wrap(licenseUrl.getBytes()));
-                               page.putToMetadata(new 
Utf8(CreativeCommons.LICENSE_LOCATION),
-                                               
ByteBuffer.wrap(licenseLocation.getBytes()));
-                       }
-
-                       if (walker.workType != null) {
-                               if (LOG.isInfoEnabled()) {
-                                       LOG.info("CC: found " + walker.workType 
+ " in " + base);
-                               }
-                               page.putToMetadata(new 
Utf8(CreativeCommons.WORK_TYPE),
-                                               
ByteBuffer.wrap(walker.workType.getBytes()));
-                       }
-
-               }
-
-               /** Scan the document looking for RDF in comments and license 
elements. */
-               private void walk(Node node) {
-
-                       // check element nodes for license URL
-                       if (node instanceof Element) {
-                               findLicenseUrl((Element) node);
-                       }
-
-                       // check comment nodes for license RDF
-                       if (node instanceof Comment) {
-                               findRdf(((Comment) node).getData());
-                       }
-
-                       // recursively walk child nodes
-                       NodeList children = node.getChildNodes();
-                       for (int i = 0; children != null && i < 
children.getLength(); i++) {
-                               walk(children.item(i));
-                       }
-               }
-
-               /**
-                * Extract license url from element, if any. Thse are the href 
attribute
-                * of anchor elements with rel="license". These must also point 
to
-                * http://creativecommons.org/licenses/.
-                */
-               private void findLicenseUrl(Element element) {
-                       // only look in Anchor elements
-                       if (!"a".equalsIgnoreCase(element.getTagName()))
-                               return;
-
-                       // require an href
-                       String href = element.getAttribute("href");
-                       if (href == null)
-                               return;
-
-                       try {
-                               URL url = new URL(base, href); // resolve the 
url
-
-                               // check that it's a CC license URL
-                               if ("http".equalsIgnoreCase(url.getProtocol())
-                                               && "creativecommons.org"
-                                                               
.equalsIgnoreCase(url.getHost())
-                                               && url.getPath() != null
-                                               && 
url.getPath().startsWith("/licenses/")
-                                               && url.getPath().length() > 
"/licenses/".length()) {
-
-                                       // check rel="license"
-                                       String rel = 
element.getAttribute("rel");
-                                       if (rel != null && "license".equals(rel)
-                                                       && this.relLicense == 
null) {
-                                               this.relLicense = url; // found 
rel license
-                                       } else if (this.anchorLicense == null) {
-                                               this.anchorLicense = url; // 
found anchor license
-                                       }
-                               }
-                       } catch (MalformedURLException e) { // ignore malformed 
urls
-                       }
-               }
-
-               /** Configure a namespace aware XML parser. */
-               private static final DocumentBuilderFactory FACTORY = 
DocumentBuilderFactory
-                               .newInstance();
-               static {
-                       FACTORY.setNamespaceAware(true);
-               }
-
-               /** Creative Commons' namespace URI. */
-               private static final String CC_NS = 
"http://web.resource.org/cc/";
-
-               /** Dublin Core namespace URI. */
-               private static final String DC_NS = 
"http://purl.org/dc/elements/1.1/";
-
-               /** RDF syntax namespace URI. */
-               private static final String RDF_NS = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
-               private void findRdf(String comment) {
-                       // first check for likely RDF in comment
-                       int rdfPosition = comment.indexOf("RDF");
-                       if (rdfPosition < 0)
-                               return; // no RDF, abort
-                       int nsPosition = comment.indexOf(CC_NS);
-                       if (nsPosition < 0)
-                               return; // no RDF, abort
-
-                       // try to parse the XML
-                       Document doc;
-                       try {
-                               DocumentBuilder parser = 
FACTORY.newDocumentBuilder();
-                               doc = parser.parse(new InputSource(new 
StringReader(comment)));
-                       } catch (Exception e) {
-                               if (LOG.isWarnEnabled()) {
-                                       LOG.warn("CC: Failed to parse RDF in " 
+ base + ": " + e);
-                               }
-                               // e.printStackTrace();
-                               return;
-                       }
-
-                       // check that root is rdf:RDF
-                       NodeList roots = doc.getElementsByTagNameNS(RDF_NS, 
"RDF");
-                       if (roots.getLength() != 1) {
-                               if (LOG.isWarnEnabled()) {
-                                       LOG.warn("CC: No RDF root in " + base);
-                               }
-                               return;
-                       }
-                       Element rdf = (Element) roots.item(0);
-
-                       // get cc:License nodes inside rdf:RDF
-                       NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, 
"License");
-                       for (int i = 0; i < licenses.getLength(); i++) {
-
-                               Element l = (Element) licenses.item(i);
-
-                               // license is rdf:about= attribute from 
cc:License
-                               this.rdfLicense = l.getAttributeNodeNS(RDF_NS, 
"about")
-                                               .getValue();
-
-                               // walk predicates of cc:License
-                               NodeList predicates = l.getChildNodes();
-                               for (int j = 0; j < predicates.getLength(); 
j++) {
-                                       Node predicateNode = predicates.item(j);
-                                       if (!(predicateNode instanceof Element))
-                                               continue;
-                                       Element predicateElement = (Element) 
predicateNode;
-
-                                       // extract predicates of cc:xxx 
predicates
-                                       if 
(!CC_NS.equals(predicateElement.getNamespaceURI())) {
-                                               continue;
-                                       }
-                                       String predicate = 
predicateElement.getLocalName();
-
-                                       // object is rdf:resource from cc:xxx 
predicates
-                                       String object = 
predicateElement.getAttributeNodeNS(RDF_NS,
-                                                       "resource").getValue();
-
-                                       // add object and predicate to metadata
-                                       // metadata.put(object, predicate);
-                                       // if (LOG.isInfoEnabled()) {
-                                       // LOG.info("CC: found: 
"+predicate+"="+object);
-                                       // }
-                               }
-                       }
-
-                       // get cc:Work nodes from rdf:RDF
-                       NodeList works = rdf.getElementsByTagNameNS(CC_NS, 
"Work");
-                       for (int i = 0; i < works.getLength(); i++) {
-                               Element l = (Element) works.item(i);
-
-                               // get dc:type nodes from cc:Work
-                               NodeList types = 
rdf.getElementsByTagNameNS(DC_NS, "type");
-                               for (int j = 0; j < types.getLength(); j++) {
-                                       Element type = (Element) types.item(j);
-                                       String workUri = type
-                                                       
.getAttributeNodeNS(RDF_NS, "resource").getValue();
-                                       this.workType = (String) 
WORK_TYPE_NAMES.get(workUri);
-                                       break;
-                               }
-                       }
-               }
+  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
+  public static class Walker {
+    private URL base; // base url of page
+    private String rdfLicense; // subject url found, if any
+    private URL relLicense; // license url found, if any
+    private URL anchorLicense; // anchor url found, if any
+    private String workType; // work type URI
+
+    private Walker(URL base) {
+      this.base = base;
+    }
+
+    /** Scan the document adding attributes to metadata. */
+    public static void walk(Node doc, URL base, WebPage page,
+        Configuration conf) throws ParseException {
+
+      // walk the DOM tree, scanning for license data
+      Walker walker = new Walker(base);
+      walker.walk(doc);
+
+      // interpret results of walk
+      String licenseUrl = null;
+      String licenseLocation = null;
+      if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+        licenseLocation = "rdf";
+       licenseUrl = walker.rdfLicense;
+      } else if (walker.relLicense != null) { // 2nd: anchor w/
+        // rel=license
+        licenseLocation = "rel";
+        licenseUrl = walker.relLicense.toString();
+      } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
+        // license
+       licenseLocation = "a";
+       licenseUrl = walker.anchorLicense.toString();
+      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) 
{
+          throw new ParseException("No CC license.  Excluding.");
+      }
+
+      // add license to metadata
+      if (licenseUrl != null) {
+        if (LOG.isDebugEnabled()) {
+         LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation + " of 
" + base);
        }
-
-       private static final Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
-
-       static {
-               FIELDS.add(WebPage.Field.BASE_URL);
-               FIELDS.add(WebPage.Field.METADATA);
+       page.putToMetadata(new Utf8(CreativeCommons.LICENSE_URL),
+       ByteBuffer.wrap(licenseUrl.getBytes()));
+       page.putToMetadata(new Utf8(CreativeCommons.LICENSE_LOCATION),
+           ByteBuffer.wrap(licenseLocation.getBytes()));
+      }
+
+      if (walker.workType != null) {
+        if (LOG.isDebugEnabled()) {
+         LOG.debug("CC: found " + walker.workType + " in " + base);
        }
-
-       private static final HashMap<String,String> WORK_TYPE_NAMES = new 
HashMap<String,String>();
-       static {
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", 
"video");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", 
"image");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", 
"audio");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
-                               "interactive");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", 
"software");
-               WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", 
"image");
+       page.putToMetadata(new Utf8(CreativeCommons.WORK_TYPE),
+          ByteBuffer.wrap(walker.workType.getBytes()));
+      }
+
+    }
+
+    /** Scan the document looking for RDF in comments and license elements. */
+    private void walk(Node node) {
+      // check element nodes for license URL
+      if (node instanceof Element) {
+        findLicenseUrl((Element) node);
+      }
+
+      // check comment nodes for license RDF
+      if (node instanceof Comment) {
+        findRdf(((Comment) node).getData());
+      }
+
+      // recursively walk child nodes
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        walk(children.item(i));
+      }
+    }
+
+    /**
+     * Extract license url from element, if any. Thse are the href attribute
+     * of anchor elements with rel="license". These must also point to
+     * http://creativecommons.org/licenses/.
+     */
+    private void findLicenseUrl(Element element) {
+      // only look in Anchor elements
+      if (!"a".equalsIgnoreCase(element.getTagName()))
+        return;
+
+      // require an href
+      String href = element.getAttribute("href");
+      if (href == null)
+        return;
+      try {
+        URL url = new URL(base, href); // resolve the url
+        // check that it's a CC license URL
+       if ("http".equalsIgnoreCase(url.getProtocol())
+           && "creativecommons.org".equalsIgnoreCase(url.getHost())
+           && url.getPath() != null && url.getPath().startsWith("/licenses/")
+           && url.getPath().length() > "/licenses/".length()) {
+
+         // check rel="license"
+         String rel = element.getAttribute("rel");
+         if (rel != null && "license".equals(rel)
+             && this.relLicense == null) {
+           this.relLicense = url; // found rel license
+         } else if (this.anchorLicense == null) {
+           this.anchorLicense = url; // found anchor license
+         }
+       }
+      } catch (MalformedURLException e) { // ignore malformed urls
+      }
+    }
+
+    /** Configure a namespace aware XML parser. */
+    private static final DocumentBuilderFactory FACTORY = 
DocumentBuilderFactory
+        .newInstance();
+      static {
+        FACTORY.setNamespaceAware(true);
+      }
+
+      /** Creative Commons' namespace URI. */
+      private static final String CC_NS = "http://web.resource.org/cc/";
+
+      /** Dublin Core namespace URI. */
+      private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
+
+      /** RDF syntax namespace URI. */
+      private static final String RDF_NS = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+      private void findRdf(String comment) {
+        // first check for likely RDF in comment
+       int rdfPosition = comment.indexOf("RDF");
+       if (rdfPosition < 0)
+         return; // no RDF, abort
+       int nsPosition = comment.indexOf(CC_NS);
+       if (nsPosition < 0)
+         return; // no RDF, abort
+       // try to parse the XML
+       Document doc;
+       try {
+          DocumentBuilder parser = FACTORY.newDocumentBuilder();
+         doc = parser.parse(new InputSource(new StringReader(comment)));
+       } catch (Exception e) {
+         if (LOG.isWarnEnabled()) {
+           LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+         }
+         // e.printStackTrace();
+         return;
        }
 
-       private Configuration conf;
+       // check that root is rdf:RDF
+       NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+       if (roots.getLength() != 1) {
+         if (LOG.isWarnEnabled()) {
+           LOG.warn("CC: No RDF root in " + base);
+         }
+         return;
+       }
+       Element rdf = (Element) roots.item(0);
 
-       public void setConf(Configuration conf) {
-               this.conf = conf;
+       // get cc:License nodes inside rdf:RDF
+       NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+       for (int i = 0; i < licenses.getLength(); i++) {
+          Element l = (Element) licenses.item(i);
+         // license is rdf:about= attribute from cc:License
+         this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
+
+          // walk predicates of cc:License
+         NodeList predicates = l.getChildNodes();
+         for (int j = 0; j < predicates.getLength(); j++) {
+           Node predicateNode = predicates.item(j);
+           if (!(predicateNode instanceof Element))
+             continue;
+             Element predicateElement = (Element) predicateNode;
+              // extract predicates of cc:xxx predicates
+             if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+               continue;
+             }
+             String predicate = predicateElement.getLocalName();
+              // object is rdf:resource from cc:xxx predicates
+             String object = predicateElement.getAttributeNodeNS(RDF_NS, 
"resource").getValue();
+              // add object and predicate to metadata
+             // metadata.put(object, predicate);
+             //if (LOG.isInfoEnabled()) {
+             // LOG.info("CC: found: "+predicate+"="+object);
+             // }
+         }
        }
 
-       public Configuration getConf() {
-               return this.conf;
+       // get cc:Work nodes from rdf:RDF
+       NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+       for (int i = 0; i < works.getLength(); i++) {
+         Element l = (Element) works.item(i);
+
+         // get dc:type nodes from cc:Work
+         NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+         for (int j = 0; j < types.getLength(); j++) {
+           Element type = (Element) types.item(j);
+           String workUri = type.getAttributeNodeNS(RDF_NS, 
"resource").getValue();
+           this.workType = (String) WORK_TYPE_NAMES.get(workUri);
+           break;
+         }
        }
+      }
+    }
 
-       @Override
-       public Collection<Field> getFields() {
-               return FIELDS;
+    private static final Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
+      static {
+        FIELDS.add(WebPage.Field.BASE_URL);
+       FIELDS.add(WebPage.Field.METADATA);
+      }
+
+      private static final HashMap<String,String> WORK_TYPE_NAMES = new 
HashMap<String,String>();
+        static {
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", 
"video");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", 
"image");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", 
"interactive");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", 
"software");
+         WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
        }
 
-       /**
-        * Adds metadata or otherwise modifies a parse of an HTML document, 
given
-        * the DOM tree of a page.
-        */
-       @Override
-       public Parse filter(String url, WebPage page, Parse parse,
-                       HTMLMetaTags metaTags, DocumentFragment doc) {
-               // construct base url
-               URL base;
-               try {
-                       base = new URL(page.getBaseUrl().toString());
-                       // extract license metadata
-                       Walker.walk(doc, base, page, getConf());
-               } catch (Exception e) {
-                       LOG.error("Error parsing " + url, e);
-                       return ParseStatusUtils.getEmptyParse(e, getConf());
-               }
+      private Configuration conf;
 
-               return parse;
+      public void setConf(Configuration conf) {
+        this.conf = conf;
+      }
+
+      public Configuration getConf() {
+        return this.conf;
+      }
+
+      @Override
+      public Collection<Field> getFields() {
+        return FIELDS;
+      }
+
+      /**
+       * Adds metadata or otherwise modifies a parse of an HTML document, given
+       * the DOM tree of a page.
+       */
+      @Override
+      public Parse filter(String url, WebPage page, Parse parse,
+          HTMLMetaTags metaTags, DocumentFragment doc) {
+        // construct base url
+       URL base;
+       try {
+         base = new URL(page.getBaseUrl().toString());
+         // extract license metadata
+         Walker.walk(doc, base, page, getConf());
+       } catch (Exception e) {
+         LOG.error("Error parsing " + url, e);
+         return ParseStatusUtils.getEmptyParse(e, getConf());
        }
+
+       return parse;
+      }
 }

svn commit: r1495159 - /nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java

Reply via email to