RelTagParser.java

lewismc Tue, 18 Jun 2013 20:20:08 -0700

Author: lewismc
Date: Wed Jun 19 03:19:26 2013
New Revision: 1494428

URL: http://svn.apache.org/r1494428
Log:
format microformat-reltag and change tag logging to debug


Modified:
    
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java

Modified: 
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1494428&r1=1494427&r2=1494428&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 Wed Jun 19 03:19:26 2013
@@ -62,56 +62,55 @@ public class RelTagParser implements Par
     Set<String> tags = null;
 
     Parser(Node node) {
-         tags = new TreeSet<String>();
-         parse(node);
+      tags = new TreeSet<String>();
+      parse(node);
     }
 
     Set<String> getRelTags() {
-         return tags;
+      return tags;
     }
 
     void parse(Node node) {
-
       if (node.getNodeType() == Node.ELEMENT_NODE) {
-           // Look for <a> tag
-           if ("a".equalsIgnoreCase(node.getNodeName())) {
-                 NamedNodeMap attrs = node.getAttributes();
-                 Node hrefNode = attrs.getNamedItem("href");
-                 // Checks that it contains a href attribute
-                 if (hrefNode != null) {
-                   Node relNode = attrs.getNamedItem("rel");
-                   // Checks that it contains a rel attribute too
-                   if (relNode != null) {
-                     // Finaly checks that rel=tag
-                         if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
-                           String tag = parseTag(hrefNode.getNodeValue());
-                           if (!StringUtil.isEmpty(tag)) {
-                                 tags.add(tag);
-                                 LOG.info("Adding tag: " + tag + " to tag set.");
-                           }
-                         }
-                   }
-                 }
+        // Look for <a> tag
+        if ("a".equalsIgnoreCase(node.getNodeName())) {
+         NamedNodeMap attrs = node.getAttributes();
+         Node hrefNode = attrs.getNamedItem("href");
+         // Checks that it contains a href attribute
+         if (hrefNode != null) {
+           Node relNode = attrs.getNamedItem("rel");
+           // Checks that it contains a rel attribute too
+           if (relNode != null) {
+             // Finaly checks that rel=tag
+             if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+               String tag = parseTag(hrefNode.getNodeValue());
+               if (!StringUtil.isEmpty(tag)) {
+                 tags.add(tag);
+                 LOG.debug("Adding tag: " + tag + " to tag set.");
+               }
+             }
            }
+         }
+       }
       }
 
-         // Recurse
-         NodeList children = node.getChildNodes();
-         for (int i = 0; children != null && i < children.getLength(); i++) {
-           parse(children.item(i));
-         }
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        parse(children.item(i));
+      }
     }
 
     private final static String parseTag(String url) {
-         String tag = null;
-         try {
-           URL u = new URL(url);
-           String path = u.getPath();
-           tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
-         } catch (Exception e) {
-           // Malformed tag...
-           tag = null;
-         } return tag;
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      } return tag;
     }
   }
 
@@ -119,21 +118,21 @@ public class RelTagParser implements Par
    * Set the {@link Configuration} object
    */
   public void setConf(Configuration conf) {
-       this.conf = conf;
+    this.conf = conf;
   }
 
   /**
    * Get the {@link Configuration} object
    */
   public Configuration getConf() {
-       return this.conf;
+    return this.conf;
   }
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
   static {
-       FIELDS.add(WebPage.Field.BASE_URL);
-       FIELDS.add(WebPage.Field.METADATA);
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.METADATA);
   }
   
   /**
@@ -144,7 +143,7 @@ public class RelTagParser implements Par
    */
   @Override
   public Collection<Field> getFields() {
-       return FIELDS;
+    return FIELDS;
   }
 
   @Override
@@ -158,19 +157,19 @@ public class RelTagParser implements Par
    * @return parse the actual {@link Parse} object
    */
   public Parse filter(String url, WebPage page, Parse parse,
-    HTMLMetaTags metaTags, DocumentFragment doc) {
-       // Trying to find the document's rel-tags
-       Parser parser = new Parser(doc);
-       Set<String> tags = parser.getRelTags();
-       // can't store multiple values in page metadata -> separate by tabs
-       StringBuffer sb = new StringBuffer();
-       Iterator<String> iter = tags.iterator();
-       while (iter.hasNext()) {
-         sb.append(iter.next());
-         sb.append("\t");
-       }
-       ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
-       page.putToMetadata(new Utf8(REL_TAG), bb);
-       return parse;
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set<String> tags = parser.getRelTags();
+    // can't store multiple values in page metadata -> separate by tabs
+    StringBuffer sb = new StringBuffer();
+    Iterator<String> iter = tags.iterator();
+    while (iter.hasNext()) {
+      sb.append(iter.next());
+      sb.append("\t");
+    }
+    ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
+    page.putToMetadata(new Utf8(REL_TAG), bb);
+    return parse;
   }
 }

svn commit: r1494428 - /nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java

Reply via email to