pr...

lewismc Wed, 28 Jan 2015 21:40:04 -0800

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Jan 29 05:38:59 2015
@@ -47,7 +47,8 @@ import org.apache.hadoop.util.Progressab
 
 /* Parse content in a segment. */
 public class ParseOutputFormat implements OutputFormat<Text, Parse> {
-  private static final Logger LOG = LoggerFactory.getLogger(ParseOutputFormat.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ParseOutputFormat.class);
 
   private URLFilters filters;
   private URLNormalizers normalizers;
@@ -56,16 +57,16 @@ public class ParseOutputFormat implement
   private static class SimpleEntry implements Entry<Text, CrawlDatum> {
     private Text key;
     private CrawlDatum value;
-    
+
     public SimpleEntry(Text key, CrawlDatum value) {
       this.key = key;
       this.value = value;
     }
-    
+
     public Text getKey() {
       return key;
     }
-    
+
     public CrawlDatum getValue() {
       return value;
     }
@@ -77,93 +78,92 @@ public class ParseOutputFormat implement
   }
 
   public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
-      Path out = FileOutputFormat.getOutputPath(job);
-      if ((out == null) && (job.getNumReduceTasks() != 0)) {
-          throw new InvalidJobConfException(
-                  "Output directory not set in JobConf.");
-      }
-      if (fs == null) {
-          fs = out.getFileSystem(job);
-      }
-      if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
-          throw new IOException("Segment already parsed!");
+    Path out = FileOutputFormat.getOutputPath(job);
+    if ((out == null) && (job.getNumReduceTasks() != 0)) {
+      throw new InvalidJobConfException("Output directory not set in JobConf.");
+    }
+    if (fs == null) {
+      fs = out.getFileSystem(job);
+    }
+    if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
+      throw new IOException("Segment already parsed!");
   }
 
   public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
-                                      String name, Progressable progress) throws IOException {
+      String name, Progressable progress) throws IOException {
 
-    if(job.getBoolean("parse.filter.urls", true)) {
+    if (job.getBoolean("parse.filter.urls", true)) {
       filters = new URLFilters(job);
     }
 
-    if(job.getBoolean("parse.normalize.urls", true)) {
+    if (job.getBoolean("parse.normalize.urls", true)) {
       normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
     }
 
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
-    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
+    final boolean ignoreExternalLinks = job.getBoolean(
+        "db.ignore.external.links", false);
     int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
     final boolean isParsing = job.getBoolean("fetcher.parse", true);
     final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
-                                                     : maxOutlinksPerPage;
-    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
+        : maxOutlinksPerPage;
+    final CompressionType compType = SequenceFileOutputFormat
+        .getOutputCompressionType(job);
     Path out = FileOutputFormat.getOutputPath(job);
-    
+
     Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
     Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
     Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
-    
-    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb","").split(" *, *");
-    
-    final MapFile.Writer textOut =
-      new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
-          CompressionType.RECORD, progress);
-    
-    final MapFile.Writer dataOut =
-      new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
-          compType, progress);
-    
-    final SequenceFile.Writer crawlOut =
-      SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
-          compType, progress);
-    
+
+    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "")
+        .split(" *, *");
+
+    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(),
+        Text.class, ParseText.class, CompressionType.RECORD, progress);
+
+    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(),
+        Text.class, ParseData.class, compType, progress);
+
+    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job,
+        crawl, Text.class, CrawlDatum.class, compType, progress);
+
     return new RecordWriter<Text, Parse>() {
 
+      public void write(Text key, Parse parse) throws IOException {
 
-        public void write(Text key, Parse parse)
-          throws IOException {
-          
-          String fromUrl = key.toString();
-          String fromHost = null; 
-          textOut.append(key, new ParseText(parse.getText()));
-          
-          ParseData parseData = parse.getData();
-          // recover the signature prepared by Fetcher or ParseSegment
-          String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
-          if (sig != null) {
-            byte[] signature = StringUtil.fromHexString(sig);
-            if (signature != null) {
-              // append a CrawlDatum with a signature
-              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
-              d.setSignature(signature);
-              crawlOut.append(key, d);
-            }
+        String fromUrl = key.toString();
+        String fromHost = null;
+        textOut.append(key, new ParseText(parse.getText()));
+
+        ParseData parseData = parse.getData();
+        // recover the signature prepared by Fetcher or ParseSegment
+        String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
+        if (sig != null) {
+          byte[] signature = StringUtil.fromHexString(sig);
+          if (signature != null) {
+            // append a CrawlDatum with a signature
+            CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+            d.setSignature(signature);
+            crawlOut.append(key, d);
           }
-          
+        }
+
         // see if the parse metadata contain things that we'd like
         // to pass to the metadata of the crawlDB entry
         CrawlDatum parseMDCrawlDatum = null;
         for (String mdname : parseMDtoCrawlDB) {
           String mdvalue = parse.getData().getParseMeta().get(mdname);
           if (mdvalue != null) {
-            if (parseMDCrawlDatum == null) parseMDCrawlDatum = new CrawlDatum(
-                CrawlDatum.STATUS_PARSE_META, 0);
+            if (parseMDCrawlDatum == null)
+              parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META,
+                  0);
             parseMDCrawlDatum.getMetaData().put(new Text(mdname),
                 new Text(mdvalue));
           }
         }
-        if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum);
+        if (parseMDCrawlDatum != null)
+          crawlOut.append(key, parseMDCrawlDatum);
 
         if (ignoreExternalLinks) {
           // need to determine fromHost (once for all outlinks)
@@ -198,91 +198,96 @@ public class ParseOutputFormat implement
           }
         }
 
-          // collect outlinks for subsequent db update
-          Outlink[] links = parseData.getOutlinks();
-          int outlinksToStore = Math.min(maxOutlinks, links.length);
-
-          int validCount = 0;
-          CrawlDatum adjust = null;
-          List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
-          List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
-          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
-            String toUrl = links[i].getToUrl();
-
-            // Only normalize and filter if fetcher.parse = false
-            if (!isParsing) {
-              toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks, filters, normalizers);
-              if (toUrl == null) {
-                continue;
-              }
-            }
-
-            CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
-            Text targetUrl = new Text(toUrl);
-            
-            // see if the outlink has any metadata attached 
-            // and if so pass that to the crawldatum so that 
-            // the initial score or distribution can use that 
-            MapWritable outlinkMD = links[i].getMetadata();
-            if (outlinkMD!=null){
-               target.getMetaData().putAll(outlinkMD);
-            }
-            
-            try {
-              scfilters.initialScore(targetUrl, target);
-            } catch (ScoringFilterException e) {
-              LOG.warn("Cannot filter init score for url " + key +
-                      ", using default: " + e.getMessage());
-              target.setScore(0.0f);
+        // collect outlinks for subsequent db update
+        Outlink[] links = parseData.getOutlinks();
+        int outlinksToStore = Math.min(maxOutlinks, links.length);
+
+        int validCount = 0;
+        CrawlDatum adjust = null;
+        List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(
+            outlinksToStore);
+        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
+        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
+          String toUrl = links[i].getToUrl();
+
+          // Only normalize and filter if fetcher.parse = false
+          if (!isParsing) {
+            toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
+                ignoreExternalLinks, filters, normalizers);
+            if (toUrl == null) {
+              continue;
             }
+          }
 
-            targets.add(new SimpleEntry(targetUrl, target));
+          CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
+          Text targetUrl = new Text(toUrl);
 
-            // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174)
-            links[i].setUrl(toUrl);
-            outlinkList.add(links[i]);
-            validCount++;
+          // see if the outlink has any metadata attached
+          // and if so pass that to the crawldatum so that
+          // the initial score or distribution can use that
+          MapWritable outlinkMD = links[i].getMetadata();
+          if (outlinkMD != null) {
+            target.getMetaData().putAll(outlinkMD);
           }
 
           try {
-            // compute score contributions and adjustment to the original score
-            adjust = scfilters.distributeScoreToOutlinks(key, parseData, 
-                      targets, null, links.length);
+            scfilters.initialScore(targetUrl, target);
           } catch (ScoringFilterException e) {
-            LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
+            LOG.warn("Cannot filter init score for url " + key
+                + ", using default: " + e.getMessage());
+            target.setScore(0.0f);
           }
-          for (Entry<Text, CrawlDatum> target : targets) {
-            crawlOut.append(target.getKey(), target.getValue());
-          }
-          if (adjust != null) crawlOut.append(key, adjust);
 
-          Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
-          parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), 
-                                    filteredLinks, parseData.getContentMeta(), 
-                                    parseData.getParseMeta());
-          dataOut.append(key, parseData);
-          if (!parse.isCanonical()) {
-            CrawlDatum datum = new CrawlDatum();
-            datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
-            String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
-            try {
-              datum.setFetchTime(Long.parseLong(timeString));
-            } catch (Exception e) {
-              LOG.warn("Can't read fetch time for: " + key);
-              datum.setFetchTime(System.currentTimeMillis());
-            }
-            crawlOut.append(key, datum);
-          }
+          targets.add(new SimpleEntry(targetUrl, target));
+
+          // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174)
+          links[i].setUrl(toUrl);
+          outlinkList.add(links[i]);
+          validCount++;
+        }
+
+        try {
+          // compute score contributions and adjustment to the original score
+          adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets,
+              null, links.length);
+        } catch (ScoringFilterException e) {
+          LOG.warn("Cannot distribute score from " + key + ": "
+              + e.getMessage());
         }
-        
-        public void close(Reporter reporter) throws IOException {
-          textOut.close();
-          dataOut.close();
-          crawlOut.close();
+        for (Entry<Text, CrawlDatum> target : targets) {
+          crawlOut.append(target.getKey(), target.getValue());
         }
-        
-      };
-    
+        if (adjust != null)
+          crawlOut.append(key, adjust);
+
+        Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList
+            .size()]);
+        parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
+            filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
+        dataOut.append(key, parseData);
+        if (!parse.isCanonical()) {
+          CrawlDatum datum = new CrawlDatum();
+          datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+          String timeString = parse.getData().getContentMeta()
+              .get(Nutch.FETCH_TIME_KEY);
+          try {
+            datum.setFetchTime(Long.parseLong(timeString));
+          } catch (Exception e) {
+            LOG.warn("Can't read fetch time for: " + key);
+            datum.setFetchTime(System.currentTimeMillis());
+          }
+          crawlOut.append(key, datum);
+        }
+      }
+
+      public void close(Reporter reporter) throws IOException {
+        textOut.close();
+        dataOut.close();
+        crawlOut.close();
+      }
+
+    };
+
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
@@ -311,12 +316,12 @@ public class ParseOutputFormat implement
       }
     }
     try {
-      if(normalizers != null) {
-        toUrl = normalizers.normalize(toUrl,
-                  urlNormalizerScope);   // normalize the url
+      if (normalizers != null) {
+        toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize
+                                                                  // the url
       }
       if (filters != null) {
-        toUrl = filters.filter(toUrl);   // filter the url
+        toUrl = filters.filter(toUrl); // filter the url
       }
       if (toUrl == null) {
         return null;


Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Thu Jan 29 05:38:59 2015
@@ -22,25 +22,23 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-
 /**
  * This class represents a natural ordering for which parsing plugin should get
  * called for a particular mimeType. It provides methods to store the
  * parse-plugins.xml data, and methods to retreive the name of the appropriate
  * parsing plugin for a contentType.
- *
+ * 
  * @author mattmann
  * @version 1.0
  */
 class ParsePluginList {
-  
+
   /* a map to link mimeType to an ordered list of parsing plugins */
   private Map<String, List<String>> fMimeTypeToPluginMap = null;
-  
+
   /* A list of aliases */
   private Map<String, String> aliases = null;
-  
-  
+
   /**
    * Constructs a new ParsePluginList
    */
@@ -48,7 +46,7 @@ class ParsePluginList {
     fMimeTypeToPluginMap = new HashMap<String, List<String>>();
     aliases = new HashMap<String, String>();
   }
-  
+
   List<String> getPluginList(String mimeType) {
     return fMimeTypeToPluginMap.get(mimeType);
   }
@@ -56,18 +54,18 @@ class ParsePluginList {
   void setAliases(Map<String, String> aliases) {
     this.aliases = aliases;
   }
-  
+
   Map<String, String> getAliases() {
     return aliases;
   }
-  
+
   void setPluginList(String mimeType, List<String> l) {
     fMimeTypeToPluginMap.put(mimeType, l);
   }
-  
+
   List<String> getSupportedMimeTypes() {
-    return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
-            new String[] {}));
+    return Arrays
+        .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {}));
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Thu Jan 29 05:38:59 2015
@@ -42,50 +42,50 @@ import org.apache.hadoop.conf.Configurat
 // Nutch imports
 import org.apache.nutch.util.NutchConfiguration;
 
-
 /**
  * A reader to load the information stored in the
  * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
- *
+ * 
  * @author mattmann
  * @version 1.0
  */
 class ParsePluginsReader {
-  
+
   /* our log stream */
-  public static final Logger LOG = LoggerFactory.getLogger(ParsePluginsReader.class);
-  
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ParsePluginsReader.class);
+
   /** The property name of the parse-plugins location */
   private static final String PP_FILE_PROP = "parse.plugin.file";
 
   /** the parse-plugins file */
   private String fParsePluginsFile = null;
 
-  
   /**
    * Constructs a new ParsePluginsReader
    */
-  public ParsePluginsReader() { }
-  
+  public ParsePluginsReader() {
+  }
+
   /**
    * Reads the <code>parse-plugins.xml</code> file and returns the
    * {@link #ParsePluginList} defined by it.
-   *
+   * 
    * @return A {@link #ParsePluginList} specified by the
    *         <code>parse-plugins.xml</code> file.
    * @throws Exception
-   *             If any parsing error occurs.
+   *           If any parsing error occurs.
    */
   public ParsePluginList parse(Configuration conf) {
-    
+
     ParsePluginList pList = new ParsePluginList();
-    
+
     // open up the XML file
     DocumentBuilderFactory factory = null;
     DocumentBuilder parser = null;
     Document document = null;
     InputSource inputSource = null;
-    
+
     InputStream ppInputStream = null;
     if (fParsePluginsFile != null) {
       URL parsePluginUrl = null;
@@ -94,56 +94,55 @@ class ParsePluginsReader {
         ppInputStream = parsePluginUrl.openStream();
       } catch (Exception e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("Unable to load parse plugins file from URL " +
-                   "[" + fParsePluginsFile + "]. Reason is [" + e + "]");
+          LOG.warn("Unable to load parse plugins file from URL " + "["
+              + fParsePluginsFile + "]. Reason is [" + e + "]");
         }
         return pList;
       }
     } else {
-      ppInputStream = conf.getConfResourceAsInputStream(
-                          conf.get(PP_FILE_PROP));
+      ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
     }
-    
+
     inputSource = new InputSource(ppInputStream);
-    
+
     try {
       factory = DocumentBuilderFactory.newInstance();
       parser = factory.newDocumentBuilder();
       document = parser.parse(inputSource);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Unable to parse [" + fParsePluginsFile + "]." +
-                 "Reason is [" + e + "]");
+        LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is ["
+            + e + "]");
       }
       return null;
     }
-    
+
     Element parsePlugins = document.getDocumentElement();
-    
+
     // build up the alias hash map
     Map<String, String> aliases = getAliases(parsePlugins);
     // And store it on the parse plugin list
     pList.setAliases(aliases);
-     
+
     // get all the mime type nodes
     NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
-    
+
     // iterate through the mime types
     for (int i = 0; i < mimeTypes.getLength(); i++) {
       Element mimeType = (Element) mimeTypes.item(i);
       String mimeTypeStr = mimeType.getAttribute("name");
-      
+
       // for each mimeType, get the plugin list
       NodeList pluginList = mimeType.getElementsByTagName("plugin");
-      
+
       // iterate through the plugins, add them in order read
       // OR if they have a special order="" attribute, then hold those in
       // a separate list, and then insert them into the final list at the
       // order specified
       if (pluginList != null && pluginList.getLength() > 0) {
         List<String> plugList = new ArrayList<String>(pluginList.getLength());
-        
-        for (int j = 0; j<pluginList.getLength(); j++) {
+
+        for (int j = 0; j < pluginList.getLength(); j++) {
           Element plugin = (Element) pluginList.item(j);
           String pluginId = plugin.getAttribute("id");
           String extId = aliases.get(pluginId);
@@ -163,110 +162,110 @@ class ParsePluginsReader {
             plugList.add(extId);
           }
         }
-        
+
         // now add the plugin list and map it to this mimeType
         pList.setPluginList(mimeTypeStr, plugList);
-        
+
       } else if (LOG.isWarnEnabled()) {
         LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
-                 + mimeTypeStr + ", continuing parse");
+            + mimeTypeStr + ", continuing parse");
       }
     }
     return pList;
   }
-  
+
   /**
    * Tests parsing of the parse-plugins.xml file. An alternative name for the
-   * file can be specified via the <code>--file</code> option, although the
-   * file must be located in the <code>$NUTCH_HOME/conf</code> directory.
-   *
+   * file can be specified via the <code>--file</code> option, although the file
+   * must be located in the <code>$NUTCH_HOME/conf</code> directory.
+   * 
    * @param args
-   *            Currently only the --file argument to specify an alternative
-   *            name for the parse-plugins.xml file is supported.
+   *          Currently only the --file argument to specify an alternative name
+   *          for the parse-plugins.xml file is supported.
    */
   public static void main(String[] args) throws Exception {
     String parsePluginFile = null;
     String usage = "ParsePluginsReader [--file <parse plugin file location>]";
-    
-    if (( args.length != 0 && args.length != 2 )
+
+    if ((args.length != 0 && args.length != 2)
         || (args.length == 2 && !"--file".equals(args[0]))) {
       System.err.println(usage);
       System.exit(1);
     }
-    
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("--file")) {
         parsePluginFile = args[++i];
       }
     }
-    
+
     ParsePluginsReader reader = new ParsePluginsReader();
-    
+
     if (parsePluginFile != null) {
       reader.setFParsePluginsFile(parsePluginFile);
     }
-    
+
     ParsePluginList prefs = reader.parse(NutchConfiguration.create());
-    
+
     for (String mimeType : prefs.getSupportedMimeTypes()) {
-      
+
       System.out.println("MIMETYPE: " + mimeType);
       List<String> plugList = prefs.getPluginList(mimeType);
-      
+
       System.out.println("EXTENSION IDs:");
-      
+
       for (String j : plugList) {
         System.out.println(j);
       }
     }
-    
+
   }
-  
+
   /**
    * @return Returns the fParsePluginsFile.
    */
   public String getFParsePluginsFile() {
     return fParsePluginsFile;
   }
-  
+
   /**
    * @param parsePluginsFile
-   *            The fParsePluginsFile to set.
+   *          The fParsePluginsFile to set.
    */
   public void setFParsePluginsFile(String parsePluginsFile) {
     fParsePluginsFile = parsePluginsFile;
   }
-  
+
   private Map<String, String> getAliases(Element parsePluginsRoot) {
 
     Map<String, String> aliases = new HashMap<String, String>();
     NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
-         
+
     if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("No aliases defined in parse-plugins.xml!");
       }
       return aliases;
     }
-         
+
     if (aliasRoot.getLength() > 1) {
       // log a warning, but try and continue processing
       if (LOG.isWarnEnabled()) {
         LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
       }
     }
-         
-    Element aliasRootElem = (Element)aliasRoot.item(0);
+
+    Element aliasRootElem = (Element) aliasRoot.item(0);
     NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
-         
+
     if (aliasElements != null && aliasElements.getLength() > 0) {
-      for (int i=0; i<aliasElements.getLength(); i++) {
-        Element aliasElem = (Element)aliasElements.item(i);
-       String parsePluginId = aliasElem.getAttribute("name");
-       String extensionId = aliasElem.getAttribute("extension-id");
+      for (int i = 0; i < aliasElements.getLength(); i++) {
+        Element aliasElem = (Element) aliasElements.item(i);
+        String parsePluginId = aliasElem.getAttribute("name");
+        String extensionId = aliasElem.getAttribute("extension-id");
         if (LOG.isTraceEnabled()) {
-          LOG.trace("Found alias: plugin-id: " + parsePluginId +
-                    ", extension-id: " + extensionId);
+          LOG.trace("Found alias: plugin-id: " + parsePluginId
+              + ", extension-id: " + extensionId);
         }
         if (parsePluginId != null && extensionId != null) {
           aliases.put(parsePluginId, extensionId);
@@ -275,5 +274,5 @@ class ParsePluginsReader {
     }
     return aliases;
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Thu Jan 29 05:38:59 2015
@@ -27,94 +27,116 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.io.Text;
 
 /**
- * A utility class that stores result of a parse. Internally
- * a ParseResult stores &lt;{@link Text}, {@link Parse}&gt; pairs.
- * <p>Parsers may return multiple results, which correspond to parts
- * or other associated documents related to the original URL.</p>
- * <p>There will be usually one parse result that corresponds directly
- * to the original URL, and possibly many (or none) results that correspond
- * to derived URLs (or sub-URLs).
+ * A utility class that stores result of a parse. Internally a ParseResult
+ * stores &lt;{@link Text}, {@link Parse}&gt; pairs.
+ * <p>
+ * Parsers may return multiple results, which correspond to parts or other
+ * associated documents related to the original URL.
+ * </p>
+ * <p>
+ * There will be usually one parse result that corresponds directly to the
+ * original URL, and possibly many (or none) results that correspond to derived
+ * URLs (or sub-URLs).
  */
 public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
   private Map<Text, Parse> parseMap;
   private String originalUrl;
-  
+
   public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class);
-  
+
   /**
    * Create a container for parse results.
-   * @param originalUrl the original url from which all parse results
-   * have been obtained.
+   * 
+   * @param originalUrl
+   *          the original url from which all parse results have been obtained.
    */
   public ParseResult(String originalUrl) {
     parseMap = new HashMap<Text, Parse>();
     this.originalUrl = originalUrl;
   }
-  
+
   /**
    * Convenience method for obtaining {@link ParseResult} from a single
    * <code>Parse</code> output.
-   * @param url canonical url.
-   * @param parse single parse output.
+   * 
+   * @param url
+   *          canonical url.
+   * @param parse
+   *          single parse output.
    * @return result containing the single parse output.
    */
   public static ParseResult createParseResult(String url, Parse parse) {
     ParseResult parseResult = new ParseResult(url);
-    parseResult.put(new Text(url), new ParseText(parse.getText()), parse.getData());
+    parseResult.put(new Text(url), new ParseText(parse.getText()),
+        parse.getData());
     return parseResult;
   }
-  
+
   /**
    * Checks whether the result is empty.
+   * 
    * @return
    */
   public boolean isEmpty() {
     return parseMap.isEmpty();
   }
-  
+
   /**
    * Return the number of parse outputs (both successful and failed)
    */
   public int size() {
     return parseMap.size();
   }
-  
+
   /**
    * Retrieve a single parse output.
-   * @param key sub-url under which the parse output is stored.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
    * @return parse output corresponding to this sub-url, or null.
    */
   public Parse get(String key) {
     return get(new Text(key));
   }
-  
+
   /**
    * Retrieve a single parse output.
-   * @param key sub-url under which the parse output is stored.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
    * @return parse output corresponding to this sub-url, or null.
    */
   public Parse get(Text key) {
     return parseMap.get(key);
   }
-  
+
   /**
    * Store a result of parsing.
-   * @param key URL or sub-url of this parse result
-   * @param text plain text result
-   * @param data corresponding parse metadata of this result
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
    */
   public void put(Text key, ParseText text, ParseData data) {
     put(key.toString(), text, data);
   }
-  
+
   /**
    * Store a result of parsing.
-   * @param key URL or sub-url of this parse result
-   * @param text plain text result
-   * @param data corresponding parse metadata of this result
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
    */
   public void put(String key, ParseText text, ParseData data) {
-    parseMap.put(new Text(key), new ParseImpl(text, data, key.equals(originalUrl)));
+    parseMap.put(new Text(key),
+        new ParseImpl(text, data, key.equals(originalUrl)));
   }
 
   /**
@@ -123,21 +145,21 @@ public class ParseResult implements Iter
   public Iterator<Entry<Text, Parse>> iterator() {
     return parseMap.entrySet().iterator();
   }
-  
+
   /**
-   * Remove all results where status is not successful (as determined
-   * by </code>ParseStatus#isSuccess()</code>). Note that effects of this operation
+   * Remove all results where status is not successful (as determined by
+   * </code>ParseStatus#isSuccess()</code>). Note that effects of this operation
    * cannot be reversed.
    */
   public void filter() {
-    for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
       Entry<Text, Parse> entry = i.next();
       if (!entry.getValue().getData().getStatus().isSuccess()) {
         LOG.warn(entry.getKey() + " is not parsed successfully, filtering");
         i.remove();
       }
     }
-      
+
   }
 
   /**
@@ -145,7 +167,7 @@ public class ParseResult implements Iter
    * Parse success is determined by <code>ParseStatus#isSuccess()</code>.
    */
   public boolean isSuccess() {
-    for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
       Entry<Text, Parse> entry = i.next();
       if (!entry.getValue().getData().getStatus().isSuccess()) {
         return false;

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Jan 29 05:38:59 2015
@@ -46,19 +46,19 @@ public class ParseSegment extends Config
     Reducer<Text, Writable, Text, Writable> {
 
   public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
-  
+
   public static final String SKIP_TRUNCATED = "parser.skip.truncated";
-  
+
   private ScoringFilters scfilters;
-  
+
   private ParseUtil parseUtil;
-  
+
   private boolean skipTruncated;
-  
+
   public ParseSegment() {
     this(null);
   }
-  
+
   public ParseSegment(Configuration conf) {
     super(conf);
   }
@@ -66,41 +66,43 @@ public class ParseSegment extends Config
   public void configure(JobConf job) {
     setConf(job);
     this.scfilters = new ScoringFilters(job);
-    skipTruncated=job.getBoolean(SKIP_TRUNCATED, true);
+    skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
+  }
+
+  public void close() {
   }
 
-  public void close() {}
-  
   private Text newKey = new Text();
 
   public void map(WritableComparable<?> key, Content content,
-                  OutputCollector<Text, ParseImpl> output, Reporter reporter)
-    throws IOException {
+      OutputCollector<Text, ParseImpl> output, Reporter reporter)
+      throws IOException {
     // convert on the fly from old UTF8 keys
     if (key instanceof Text) {
       newKey.set(key.toString());
       key = newKey;
     }
-    
-    int status =
-      Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY));
+
+    int status = Integer.parseInt(content.getMetadata().get(
+        Nutch.FETCH_STATUS_KEY));
     if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
       // content not fetched successfully, skip document
       LOG.debug("Skipping " + key + " as content is not fetched successfully");
       return;
     }
-    
+
     if (skipTruncated && isTruncated(content)) {
       return;
     }
 
     ParseResult parseResult = null;
     try {
-      if (parseUtil == null) 
+      if (parseUtil == null)
         parseUtil = new ParseUtil(getConf());
       parseResult = parseUtil.parse(content);
     } catch (Exception e) {
-      LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+      LOG.warn("Error parsing: " + key + ": "
+          + StringUtils.stringifyException(e));
       return;
     }
 
@@ -111,7 +113,8 @@ public class ParseSegment extends Config
 
       long start = System.currentTimeMillis();
 
-      reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+      reporter.incrCounter("ParserStatus",
+          ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
 
       if (!parseStatus.isSuccess()) {
         LOG.warn("Error parsing: " + key + ": " + parseStatus);
@@ -119,45 +122,51 @@ public class ParseSegment extends Config
       }
 
       // pass segment name to parse data
-      parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
-                                           getConf().get(Nutch.SEGMENT_NAME_KEY));
+      parse.getData().getContentMeta()
+          .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
 
       // compute the new signature
-      byte[] signature = 
-        SignatureFactory.getSignature(getConf()).calculate(content, parse); 
-      parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
-          StringUtil.toHexString(signature));
-      
+      byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+          content, parse);
+      parse.getData().getContentMeta()
+          .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+
       try {
         scfilters.passScoreAfterParsing(url, content, parse);
       } catch (ScoringFilterException e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("Error passing score: "+ url +": "+e.getMessage());
+          LOG.warn("Error passing score: " + url + ": " + e.getMessage());
         }
       }
 
       long end = System.currentTimeMillis();
       LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url);
 
-      output.collect(url, new ParseImpl(new ParseText(parse.getText()), 
-                                        parse.getData(), parse.isCanonical()));
+      output.collect(
+          url,
+          new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse
+              .isCanonical()));
     }
   }
-  
+
   /**
    * Checks if the page's content is truncated.
+   * 
    * @param content
-   * @return If the page is truncated <code>true</code>. When it is not,
-   * or when it could be determined, <code>false</code>. 
+   * @return If the page is truncated <code>true</code>. When it is not, or when
+   *         it could be determined, <code>false</code>.
    */
   public static boolean isTruncated(Content content) {
     byte[] contentBytes = content.getContent();
-    if (contentBytes == null) return false;
+    if (contentBytes == null)
+      return false;
     Metadata metadata = content.getMetadata();
-    if (metadata == null) return false;
-    
+    if (metadata == null)
+      return false;
+
     String lengthStr = metadata.get(Response.CONTENT_LENGTH);
-    if (lengthStr != null) lengthStr=lengthStr.trim();
+    if (lengthStr != null)
+      lengthStr = lengthStr.trim();
     if (StringUtil.isEmpty(lengthStr)) {
       return false;
     }
@@ -176,14 +185,15 @@ public class ParseSegment extends Config
       return true;
     }
     if (LOG.isDebugEnabled()) {
-      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+          + inHeaderSize);
     }
     return false;
   }
 
   public void reduce(Text key, Iterator<Writable> values,
-                     OutputCollector<Text, Writable> output, Reporter reporter)
-    throws IOException {
+      OutputCollector<Text, Writable> output, Reporter reporter)
+      throws IOException {
     output.collect(key, values.next()); // collect first value
   }
 
@@ -204,7 +214,7 @@ public class ParseSegment extends Config
     job.setInputFormat(SequenceFileInputFormat.class);
     job.setMapperClass(ParseSegment.class);
     job.setReducerClass(ParseSegment.class);
-    
+
     FileOutputFormat.setOutputPath(job, segment);
     job.setOutputFormat(ParseOutputFormat.class);
     job.setOutputKeyClass(Text.class);
@@ -212,15 +222,16 @@ public class ParseSegment extends Config
 
     JobClient.runJob(job);
     long end = System.currentTimeMillis();
-    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
   }
 
-
   public static void main(String[] args) throws Exception {
-       int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
-       System.exit(res);
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(),
+        args);
+    System.exit(res);
   }
-         
+
   public int run(String[] args) throws Exception {
     Path segment;
 
@@ -231,11 +242,11 @@ public class ParseSegment extends Config
       System.exit(-1);
     }
 
-    if(args.length > 1) {
-      for(int i = 1; i < args.length; i++) {
+    if (args.length > 1) {
+      for (int i = 1; i < args.length; i++) {
         String param = args[i];
 
-        if("-nofilter".equalsIgnoreCase(param)) {
+        if ("-nofilter".equalsIgnoreCase(param)) {
           getConf().setBoolean("parse.filter.urls", false);
         } else if ("-nonormalize".equalsIgnoreCase(param)) {
           getConf().setBoolean("parse.normalize.urls", false);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jan 29 05:38:59 2015
@@ -1,19 +1,19 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 /*
  * Created on Apr 28, 2005
  * Author: Andrzej Bialecki &lt;[email protected]&gt;
@@ -32,113 +32,119 @@ import org.apache.hadoop.conf.Configurat
 
 import org.apache.nutch.metadata.Metadata;
 
-
 /**
  * @author Andrzej Bialecki &lt;[email protected]&gt;
  */
 public class ParseStatus implements Writable {
-  
+
   private final static byte VERSION = 2;
-  
+
   // Primary status codes:
-  
+
   /** Parsing was not performed. */
-  public static final byte NOTPARSED       = 0;
+  public static final byte NOTPARSED = 0;
   /** Parsing succeeded. */
-  public static final byte SUCCESS         = 1;
+  public static final byte SUCCESS = 1;
   /** General failure. There may be a more specific error message in arguments. */
-  public static final byte FAILED          = 2;
-  
-  public static final String[] majorCodes = {
-          "notparsed",
-          "success",
-          "failed"
-  };
-  
+  public static final byte FAILED = 2;
+
+  public static final String[] majorCodes = { "notparsed", "success", "failed" };
+
   // Secondary success codes go here:
-  
-  /** Parsed content contains a directive to redirect to another URL.
-   * The target URL can be retrieved from the arguments.
+
+  /**
+   * Parsed content contains a directive to redirect to another URL. The target
+   * URL can be retrieved from the arguments.
    */
-  public static final short SUCCESS_REDIRECT          = 100;
-  
+  public static final short SUCCESS_REDIRECT = 100;
+
   // Secondary failure codes go here:
-  
-  /** Parsing failed. An Exception occured (which may be retrieved from the arguments). */
-  public static final short FAILED_EXCEPTION          = 200;
-  /** Parsing failed. Content was truncated, but the parser cannot handle incomplete content. */
-  public static final short FAILED_TRUNCATED          = 202;
-  /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
-  public static final short FAILED_INVALID_FORMAT     = 203;
-  /** Parsing failed. Other related parts of the content are needed to complete
+
+  /**
+   * Parsing failed. An Exception occured (which may be retrieved from the
+   * arguments).
+   */
+  public static final short FAILED_EXCEPTION = 200;
+  /**
+   * Parsing failed. Content was truncated, but the parser cannot handle
+   * incomplete content.
+   */
+  public static final short FAILED_TRUNCATED = 202;
+  /**
+   * Parsing failed. Invalid format - the content may be corrupted or of wrong
+   * type.
+   */
+  public static final short FAILED_INVALID_FORMAT = 203;
+  /**
+   * Parsing failed. Other related parts of the content are needed to complete
    * parsing. The list of URLs to missing parts may be provided in arguments.
    * The Fetcher may decide to fetch these parts at once, then put them into
    * Content.metadata, and supply them for re-parsing.
    */
-  public static final short FAILED_MISSING_PARTS      = 204;
-  /** Parsing failed. There was no content to be parsed - probably caused
-   * by errors at protocol stage.
+  public static final short FAILED_MISSING_PARTS = 204;
+  /**
+   * Parsing failed. There was no content to be parsed - probably caused by
+   * errors at protocol stage.
    */
-  public static final short FAILED_MISSING_CONTENT    = 205;
-
+  public static final short FAILED_MISSING_CONTENT = 205;
 
   public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED);
   public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
   public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
-  
+
   private byte majorCode = 0;
   private short minorCode = 0;
   private String[] args = null;
-  
+
   public byte getVersion() {
     return VERSION;
   }
 
   public ParseStatus() {
-    
+
   }
-  
+
   public ParseStatus(int majorCode, int minorCode, String[] args) {
     this.args = args;
-    this.majorCode = (byte)majorCode;
-    this.minorCode = (short)minorCode;
+    this.majorCode = (byte) majorCode;
+    this.minorCode = (short) minorCode;
   }
-  
+
   public ParseStatus(int majorCode) {
-    this(majorCode, 0, (String[])null);
+    this(majorCode, 0, (String[]) null);
   }
-  
+
   public ParseStatus(int majorCode, String[] args) {
     this(majorCode, 0, args);
   }
-  
+
   public ParseStatus(int majorCode, int minorCode) {
-    this(majorCode, minorCode, (String[])null);
+    this(majorCode, minorCode, (String[]) null);
   }
-  
+
   /** Simplified constructor for passing just a text message. */
   public ParseStatus(int majorCode, int minorCode, String message) {
-    this(majorCode, minorCode, new String[]{message});
+    this(majorCode, minorCode, new String[] { message });
   }
-  
+
   /** Simplified constructor for passing just a text message. */
   public ParseStatus(int majorCode, String message) {
-    this(majorCode, 0, new String[]{message});
+    this(majorCode, 0, new String[] { message });
   }
-  
+
   public ParseStatus(Throwable t) {
-    this(FAILED, FAILED_EXCEPTION, new String[]{t.toString()});
+    this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() });
   }
-  
+
   public static ParseStatus read(DataInput in) throws IOException {
     ParseStatus res = new ParseStatus();
     res.readFields(in);
     return res;
   }
-  
+
   public void readFields(DataInput in) throws IOException {
     byte version = in.readByte();
-    switch(version) {
+    switch (version) {
     case 1:
       majorCode = in.readByte();
       minorCode = in.readShort();
@@ -152,8 +158,8 @@ public class ParseStatus implements Writ
     default:
       throw new VersionMismatchException(VERSION, version);
     }
- }
-  
+  }
+
   public void write(DataOutput out) throws IOException {
     out.writeByte(VERSION);
     out.writeByte(majorCode);
@@ -164,55 +170,61 @@ public class ParseStatus implements Writ
       WritableUtils.writeStringArray(out, args);
     }
   }
-  
-  /** A convenience method. Returns true if majorCode is SUCCESS, false
+
+  /**
+   * A convenience method. Returns true if majorCode is SUCCESS, false
    * otherwise.
    */
-  
+
   public boolean isSuccess() {
     return majorCode == SUCCESS;
   }
-  
-  /** A convenience method. Return a String representation of the first
-   * argument, or null.
+
+  /**
+   * A convenience method. Return a String representation of the first argument,
+   * or null.
    */
   public String getMessage() {
     if (args != null && args.length > 0 && args[0] != null)
       return args[0];
     return null;
   }
-  
+
   public String[] getArgs() {
     return args;
   }
-  
+
   public int getMajorCode() {
     return majorCode;
   }
-  
+
   public int getMinorCode() {
     return minorCode;
   }
-  
-  /** A convenience method. Creates an empty Parse instance,
-   * which returns this status.
+
+  /**
+   * A convenience method. Creates an empty Parse instance, which returns this
+   * status.
    */
   public Parse getEmptyParse(Configuration conf) {
     return new EmptyParseImpl(this, conf);
   }
-  
-  /** A convenience method. Creates an empty ParseResult,
-   * which contains this status.
+
+  /**
+   * A convenience method. Creates an empty ParseResult, which contains this
+   * status.
    */
   public ParseResult getEmptyParseResult(String url, Configuration conf) {
     return ParseResult.createParseResult(url, getEmptyParse(conf));
   }
-  
+
   public String toString() {
     StringBuffer res = new StringBuffer();
     String name = null;
-    if (majorCode >= 0 && majorCode < majorCodes.length) name = majorCodes[majorCode];
-    else name = "UNKNOWN!";
+    if (majorCode >= 0 && majorCode < majorCodes.length)
+      name = majorCodes[majorCode];
+    else
+      name = "UNKNOWN!";
     res.append(name + "(" + majorCode + "," + minorCode + ")");
     if (args != null) {
       if (args.length == 1) {
@@ -226,18 +238,18 @@ public class ParseStatus implements Writ
     }
     return res.toString();
   }
-  
+
   public void setArgs(String[] args) {
     this.args = args;
   }
-  
+
   public void setMessage(String msg) {
     if (args == null || args.length == 0) {
       args = new String[1];
     }
     args[0] = msg;
   }
-  
+
   public void setMajorCode(byte majorCode) {
     this.majorCode = majorCode;
   }
@@ -245,37 +257,45 @@ public class ParseStatus implements Writ
   public void setMinorCode(short minorCode) {
     this.minorCode = minorCode;
   }
-  
+
   public boolean equals(Object o) {
-    if (o == null) return false;
-    if (!(o instanceof ParseStatus)) return false;
+    if (o == null)
+      return false;
+    if (!(o instanceof ParseStatus))
+      return false;
     boolean res = true;
-    ParseStatus other = (ParseStatus)o;
-    res = res && (this.majorCode == other.majorCode) &&
-      (this.minorCode == other.minorCode);
-    if (!res) return res;
+    ParseStatus other = (ParseStatus) o;
+    res = res && (this.majorCode == other.majorCode)
+        && (this.minorCode == other.minorCode);
+    if (!res)
+      return res;
     if (this.args == null) {
-      if (other.args == null) return true;
-      else return false;
+      if (other.args == null)
+        return true;
+      else
+        return false;
     } else {
-      if (other.args == null) return false;
-      if (other.args.length != this.args.length) return false;
+      if (other.args == null)
+        return false;
+      if (other.args.length != this.args.length)
+        return false;
       for (int i = 0; i < this.args.length; i++) {
-        if (!this.args[i].equals(other.args[i])) return false;
+        if (!this.args[i].equals(other.args[i]))
+          return false;
       }
     }
     return true;
   }
-  
+
   private static class EmptyParseImpl implements Parse {
-    
+
     private ParseData data = null;
-    
+
     public EmptyParseImpl(ParseStatus status, Configuration conf) {
-      data = new ParseData(status, "", new Outlink[0],
-                           new Metadata(), new Metadata());
+      data = new ParseData(status, "", new Outlink[0], new Metadata(),
+          new Metadata());
     }
-    
+
     public ParseData getData() {
       return data;
     }
@@ -283,10 +303,9 @@ public class ParseStatus implements Writ
     public String getText() {
       return "";
     }
-    
+
     public boolean isCanonical() {
       return true;
     }
   }
 }
-

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java Thu Jan 29 05:38:59 2015
@@ -33,10 +33,12 @@ public final class ParseText implements
 
   private final static byte VERSION = 2;
 
-  public ParseText() {}
+  public ParseText() {
+  }
+
   private String text;
-    
-  public ParseText(String text){
+
+  public ParseText(String text) {
     this.text = text;
   }
 
@@ -68,12 +70,14 @@ public final class ParseText implements
   //
   // Accessor methods
   //
-  public String getText()  { return text; }
+  public String getText() {
+    return text;
+  }
 
   public boolean equals(Object o) {
     if (!(o instanceof ParseText))
       return false;
-    ParseText other = (ParseText)o;
+    ParseText other = (ParseText) o;
     return this.text.equals(other.text);
   }
 
@@ -90,12 +94,11 @@ public final class ParseText implements
     }
     Options opts = new Options();
     Configuration conf = NutchConfiguration.create();
-    
-    GenericOptionsParser parser =
-      new GenericOptionsParser(conf, opts, argv);
-    
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
     String[] remainingArgs = parser.getRemainingArgs();
-    
+
     FileSystem fs = FileSystem.get(conf);
     try {
       int recno = Integer.parseInt(remainingArgs[0]);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Thu Jan 29 05:38:59 2015
@@ -30,127 +30,136 @@ import org.apache.nutch.protocol.Content
 
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 
-
 /**
  * A Utility class containing methods to simply perform parsing utilities such
  * as iterating through a preferred list of {@link Parser}s to obtain
  * {@link Parse} objects.
- *
+ * 
  * @author mattmann
  * @author J&eacute;r&ocirc;me Charron
  * @author S&eacute;bastien Le Callonnec
  */
 public class ParseUtil {
-  
+
   /* our log stream */
   public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
   private ParserFactory parserFactory;
   /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
   private int maxParseTime = 30;
   private ExecutorService executorService;
-  
+
   /**
    * 
    * @param conf
    */
   public ParseUtil(Configuration conf) {
     this.parserFactory = new ParserFactory(conf);
-    maxParseTime=conf.getInt("parser.timeout", 30);
+    maxParseTime = conf.getInt("parser.timeout", 30);
     executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
-      .setNameFormat("parse-%d").setDaemon(true).build());
+        .setNameFormat("parse-%d").setDaemon(true).build());
   }
-  
+
   /**
    * Performs a parse by iterating through a List of preferred {@link Parser}s
    * until a successful parse is performed and a {@link Parse} object is
    * returned. If the parse is unsuccessful, a message is logged to the
    * <code>WARNING</code> level, and an empty parse is returned.
-   *
-   * @param content The content to try and parse.
+   * 
+   * @param content
+   *          The content to try and parse.
    * @return &lt;key, {@link Parse}&gt; pairs.
-   * @throws ParseException If no suitable parser is found to perform the parse.
+   * @throws ParseException
+   *           If no suitable parser is found to perform the parse.
    */
   public ParseResult parse(Content content) throws ParseException {
     Parser[] parsers = null;
-    
+
     try {
-      parsers = this.parserFactory.getParsers(content.getContentType(), 
-                content.getUrl() != null ? content.getUrl():"");
+      parsers = this.parserFactory.getParsers(content.getContentType(),
+          content.getUrl() != null ? content.getUrl() : "");
     } catch (ParserNotFound e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
-               " of type " + content.getContentType());
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
       }
       throw new ParseException(e.getMessage());
     }
-    
+
     ParseResult parseResult = null;
-    for (int i=0; i<parsers.length; i++) {
+    for (int i = 0; i < parsers.length; i++) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
+        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i]
+            + "]");
       }
-      if (maxParseTime!=-1)
-       parseResult = runParser(parsers[i], content);
-      else 
-       parseResult = parsers[i].getParse(content);
+      if (maxParseTime != -1)
+        parseResult = runParser(parsers[i], content);
+      else
+        parseResult = parsers[i].getParse(content);
 
       if (parseResult != null && !parseResult.isEmpty())
         return parseResult;
     }
-   
-    if (LOG.isWarnEnabled()) { 
-      LOG.warn("Unable to successfully parse content " + content.getUrl() +
-               " of type " + content.getContentType());
-    }
-    return new ParseStatus(new ParseException("Unable to successfully parse content")).getEmptyParseResult(content.getUrl(), null);
+
+    if (LOG.isWarnEnabled()) {
+      LOG.warn("Unable to successfully parse content " + content.getUrl()
+          + " of type " + content.getContentType());
+    }
+    return new ParseStatus(new ParseException(
+        "Unable to successfully parse content")).getEmptyParseResult(
+        content.getUrl(), null);
   }
-    
+
   /**
    * Method parses a {@link Content} object using the {@link Parser} specified
-   * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
-   * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
-   * level message is logged, and a ParseException is thrown. If the parse is
-   * uncessful for any other reason, then a <code>WARNING</code> level
-   * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
-   * returned.
-   *
-   * @param extId The extension implementation ID of the {@link Parser} to use
-   *              to parse the specified content.
-   * @param content The content to parse.
-   *
-   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful, otherwise,
-   *         a single &lt;key, <code>ParseStatus.getEmptyParse()</code>&gt; pair.
-   *
-   * @throws ParseException If there is no suitable {@link Parser} found
-   *                        to perform the parse.
+   * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a
+   * suitable {@link Parser} is not found, then a <code>WARNING</code> level
+   * message is logged, and a ParseException is thrown. If the parse is
+   * uncessful for any other reason, then a <code>WARNING</code> level message
+   * is logged, and a <code>ParseStatus.getEmptyParse()</code> is returned.
+   * 
+   * @param extId
+   *          The extension implementation ID of the {@link Parser} to use to
+   *          parse the specified content.
+   * @param content
+   *          The content to parse.
+   * 
+   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
+   *         otherwise, a single &lt;key,
+   *         <code>ParseStatus.getEmptyParse()</code>&gt; pair.
+   * 
+   * @throws ParseException
+   *           If there is no suitable {@link Parser} found to perform the
+   *           parse.
    */
   public ParseResult parseByExtensionId(String extId, Content content)
-  throws ParseException {
+      throws ParseException {
     Parser p = null;
-    
+
     try {
       p = this.parserFactory.getParserById(extId);
     } catch (ParserNotFound e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
-            " of type " + content.getContentType());
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
       }
       throw new ParseException(e.getMessage());
     }
-    
+
     ParseResult parseResult = null;
-    if (maxParseTime!=-1)
-       parseResult = runParser(p, content);
-    else 
-       parseResult = p.getParse(content);
+    if (maxParseTime != -1)
+      parseResult = runParser(p, content);
+    else
+      parseResult = p.getParse(content);
     if (parseResult != null && !parseResult.isEmpty()) {
       return parseResult;
     } else {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Unable to successfully parse content " + content.getUrl() +
-            " of type " + content.getContentType());
-      }  
-      return new ParseStatus(new ParseException("Unable to successfully parse content")).getEmptyParseResult(content.getUrl(), null);
+        LOG.warn("Unable to successfully parse content " + content.getUrl()
+            + " of type " + content.getContentType());
+      }
+      return new ParseStatus(new ParseException(
+          "Unable to successfully parse content")).getEmptyParseResult(
+          content.getUrl(), null);
     }
   }
 
@@ -168,5 +177,5 @@ public class ParseUtil {
     }
     return res;
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Parser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Parser.java Thu Jan 29 05:38:59 2015
@@ -24,33 +24,35 @@ import org.apache.hadoop.conf.Configurab
 import org.apache.nutch.plugin.Pluggable;
 import org.apache.nutch.protocol.Content;
 
-/** A parser for content generated by a {@link org.apache.nutch.protocol.Protocol}
- * implementation.  This interface is implemented by extensions.  Nutch's core
- * contains no page parsing code.
+/**
+ * A parser for content generated by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is
+ * implemented by extensions. Nutch's core contains no page parsing code.
  */
 public interface Parser extends Pluggable, Configurable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Parser.class.getName();
 
-  /** 
+  /**
    * <p>
-   * This method parses the given content and returns a map of
-   * &lt;key, parse&gt; pairs. {@link Parse} instances will be persisted 
-   * under the given key.
+   * This method parses the given content and returns a map of &lt;key,
+   * parse&gt; pairs. {@link Parse} instances will be persisted under the given
+   * key.
    * </p>
    * <p>
-   * Note: Meta-redirects should be followed only when they are coming from
-   * the original URL. That is: <br> 
+   * Note: Meta-redirects should be followed only when they are coming from the
+   * original URL. That is: <br>
    * Assume fetcher is in parsing mode and is currently processing
-   * foo.bar.com/redirect.html. If this url contains a meta redirect
-   * to another url, fetcher should only follow the redirect if the map
-   * contains an entry of the form &lt;"foo.bar.com/redirect.html", 
-   * {@link Parse} with a {@link ParseStatus} indicating the redirect&gt;.
+   * foo.bar.com/redirect.html. If this url contains a meta redirect to another
+   * url, fetcher should only follow the redirect if the map contains an entry
+   * of the form &lt;"foo.bar.com/redirect.html", {@link Parse} with a
+   * {@link ParseStatus} indicating the redirect&gt;.
    * </p>
    * 
-   * @param c Content to be parsed
+   * @param c
+   *          Content to be parsed
    * @return a map containing &lt;key, parse&gt; pairs
    * @since NUTCH-443
    */
-   ParseResult getParse(Content c);
+  ParseResult getParse(Content c);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jan 29 05:38:59 2015
@@ -40,28 +40,30 @@ import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.StringUtil;
 
 /**
- * Parser checker, useful for testing parser.
- * It also accurately reports possible fetching and 
- * parsing failures and presents protocol status signals to aid 
- * debugging. The tool enables us to retrieve the following data from 
- * any url:
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from any
+ * url:
  * <ol>
- * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} type.</li>
- * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and is used to remove
- * duplicates during the dedup procedure. 
- * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
  * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
  * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
  * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
  * <li><tt>Title</tt>: of the URL</li>
  * <li><tt>Outlinks</tt>: associated with the URL</li>
  * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
- * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, <i>Cache-Control</>, etc.</li>
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</>, etc.</li>
  * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
  * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
- * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing on 
- * <code>content.length</code> configuration.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing
+ * on <code>content.length</code> configuration.</li>
  * </ol>
+ * 
  * @author John Xing
  */
 
@@ -132,12 +134,13 @@ public class ParserChecker implements To
     Protocol protocol = factory.getProtocol(url);
     Text turl = new Text(url);
     ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
-    
+
     if (!output.getStatus().isSuccess()) {
-      System.err.println("Fetch failed with protocol status: " + output.getStatus());
+      System.err.println("Fetch failed with protocol status: "
+          + output.getStatus());
       return (-1);
     }
-    
+
     Content content = output.getContent();
 
     if (content == null) {
@@ -166,11 +169,12 @@ public class ParserChecker implements To
       scfilters.passScoreBeforeParsing(turl, cd, content);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
+        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
+            + ")");
         LOG.warn(StringUtils.stringifyException(e));
       }
-    }    
-    
+    }
+
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     if (parseResult == null) {
@@ -179,8 +183,9 @@ public class ParserChecker implements To
     }
 
     // Calculate the signature
-    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
-    
+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+        content, parseResult.get(new Text(url)));
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);
@@ -204,7 +209,8 @@ public class ParserChecker implements To
       scfilters.passScoreAfterParsing(turl, content, parse);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
+        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e
+            + ")");
         LOG.warn(StringUtils.stringifyException(e));
       }
     }

svn commit: r1655526 [7/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...

Reply via email to