Modified: nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Thu Jan 29 05:38:59 2015 @@ -35,16 +35,15 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; - /** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */ public class DmozParser { public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class); - - long pages = 0; + + long pages = 0; /** - * This filter fixes characters that might offend our parser. - * This lets us be tolerant of errors that might appear in the input XML. + * This filter fixes characters that might offend our parser. This lets us be + * tolerant of errors that might appear in the input XML. */ private static class XMLCharFilter extends FilterReader { private boolean lastBad = false; @@ -56,9 +55,9 @@ public class DmozParser { public int read() throws IOException { int c = in.read(); int value = c; - if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters + if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters value = 'X'; - else if (lastBad && c == '<') { // fix mis-matched brackets + else if (lastBad && c == '<') { // fix mis-matched brackets in.mark(1); if (in.read() != '/') value = 'X'; @@ -69,37 +68,35 @@ public class DmozParser { return value; } - public int read(char[] cbuf, int off, int len) - throws IOException { + public int read(char[] cbuf, int off, int len) throws IOException { int n = in.read(cbuf, off, len); if (n != -1) { for (int i = 0; i < n; i++) { - char c = cbuf[off+i]; + char c = cbuf[off + i]; char value = c; - if (!(XMLChar.isValid(c))) // fix invalid characters + if (!(XMLChar.isValid(c))) // fix invalid characters value = 'X'; - else if (lastBad && c == '<') { // fix mis-matched brackets - if (i != n-1 && cbuf[off+i+1] != '/') + else if (lastBad && c == '<') { // fix mis-matched brackets + if (i != n - 1 && cbuf[off + i + 1] != '/') value = 'X'; } lastBad = (c == 65533); - cbuf[off+i] = value; + cbuf[off + i] = value; } } return n; } } - /** - * The RDFProcessor receives tag messages during a parse - * of RDF XML data. We build whatever structures we need - * from these messages. + * The RDFProcessor receives tag messages during a parse of RDF XML data. We + * build whatever structures we need from these messages. */ private class RDFProcessor extends DefaultHandler { String curURL = null, curSection = null; - boolean titlePending = false, descPending = false, insideAdultSection = false; - Pattern topicPattern = null; + boolean titlePending = false, descPending = false, + insideAdultSection = false; + Pattern topicPattern = null; StringBuffer title = new StringBuffer(), desc = new StringBuffer(); XMLReader reader; int subsetDenom; @@ -108,10 +105,12 @@ public class DmozParser { Locator location; /** - * Pass in an XMLReader, plus a flag as to whether we - * should include adult material. + * Pass in an XMLReader, plus a flag as to whether we should include adult + * material. 
*/ - public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException { + public RDFProcessor(XMLReader reader, int subsetDenom, + boolean includeAdult, int skew, Pattern topicPattern) + throws IOException { this.reader = reader; this.subsetDenom = subsetDenom; this.includeAdult = includeAdult; @@ -127,20 +126,21 @@ public class DmozParser { /** * Start of an XML elt */ - public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { + public void startElement(String namespaceURI, String localName, + String qName, Attributes atts) throws SAXException { if ("Topic".equals(qName)) { curSection = atts.getValue("r:id"); } else if ("ExternalPage".equals(qName)) { // Porn filter - if ((! includeAdult) && curSection.startsWith("Top/Adult")) { + if ((!includeAdult) && curSection.startsWith("Top/Adult")) { return; } - + if (topicPattern != null && !topicPattern.matcher(curSection).matches()) { return; } - // Subset denominator filter. + // Subset denominator filter. // Only emit with a chance of 1/denominator. String url = atts.getValue("about"); int hashValue = MD5Hash.digest(url).hashCode(); @@ -173,18 +173,18 @@ public class DmozParser { * Termination of XML elt */ public void endElement(String namespaceURI, String localName, String qName) - throws SAXException { + throws SAXException { if (curURL != null) { if ("ExternalPage".equals(qName)) { // - // Inc the number of pages, insert the page, and + // Inc the number of pages, insert the page, and // possibly print status. // - System.out.println(curURL); + System.out.println(curURL); pages++; // - // Clear out the link text. This is what + // Clear out the link text. This is what // you would use for adding to the linkdb. // if (title.length() > 0) { @@ -219,15 +219,13 @@ public class DmozParser { } /** - * From time to time the Parser will set the "current location" - * by calling this function. It's useful for emitting locations - * for error messages. + * From time to time the Parser will set the "current location" by calling + * this function. It's useful for emitting locations for error messages. */ public void setDocumentLocator(Locator locator) { location = locator; } - // // Interface ErrorHandler // @@ -247,11 +245,11 @@ public class DmozParser { public void errorError(SAXParseException spe) { if (LOG.isErrorEnabled()) { LOG.error("Fatal err: " + spe.toString() + ": " + spe.getMessage()); - LOG.error("Last known line is " + location.getLineNumber() + - ", column " + location.getColumnNumber()); + LOG.error("Last known line is " + location.getLineNumber() + + ", column " + location.getColumnNumber()); } } - + /** * Emit exception warning message */ @@ -263,34 +261,33 @@ public class DmozParser { } /** - * Iterate through all the items in this structured DMOZ file. - * Add each URL to the web db. + * Iterate through all the items in this structured DMOZ file. Add each URL to + * the web db. 
*/ public void parseDmozFile(File dmozFile, int subsetDenom, - boolean includeAdult, - int skew, - Pattern topicPattern) + boolean includeAdult, int skew, Pattern topicPattern) - throws IOException, SAXException, ParserConfigurationException { + throws IOException, SAXException, ParserConfigurationException { SAXParserFactory parserFactory = SAXParserFactory.newInstance(); SAXParser parser = parserFactory.newSAXParser(); XMLReader reader = parser.getXMLReader(); // Create our own processor to receive SAX events - RDFProcessor rp = - new RDFProcessor(reader, subsetDenom, includeAdult, - skew, topicPattern); + RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew, + topicPattern); reader.setContentHandler(rp); reader.setErrorHandler(rp); LOG.info("skew = " + rp.hashSkew); // - // Open filtered text stream. The TextFilter makes sure that + // Open filtered text stream. The TextFilter makes sure that // only appropriate XML-approved Text characters are received. // Any non-conforming characters are silently skipped. // - XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8"))); + XMLCharFilter in = new XMLCharFilter(new BufferedReader( + new InputStreamReader(new BufferedInputStream(new FileInputStream( + dmozFile)), "UTF-8"))); try { InputSource is = new InputSource(in); reader.parse(is); @@ -304,18 +301,17 @@ public class DmozParser { } } - private static void addTopicsFromFile(String topicFile, - Vector<String> topics) - throws IOException { + private static void addTopicsFromFile(String topicFile, Vector<String> topics) + throws IOException { BufferedReader in = null; try { - in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8")); + in = new BufferedReader(new InputStreamReader(new FileInputStream( + topicFile), "UTF-8")); String line = null; while ((line = in.readLine()) != null) { topics.addElement(new String(line)); } - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(e.toString()); } @@ -324,18 +320,19 @@ public class DmozParser { in.close(); } } - + /** - * Command-line access. User may add URLs via a flat text file - * or the structured DMOZ file. By default, we ignore Adult - * material (as categorized by DMOZ). + * Command-line access. User may add URLs via a flat text file or the + * structured DMOZ file. By default, we ignore Adult material (as categorized + * by DMOZ). 
*/ public static void main(String argv[]) throws Exception { if (argv.length < 1) { - System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); + System.err + .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); return; } - + // // Parse the command line, figure out what kind of // URL file we need to load @@ -344,9 +341,9 @@ public class DmozParser { int skew = 0; String dmozFile = argv[0]; boolean includeAdult = false; - Pattern topicPattern = null; + Pattern topicPattern = null; Vector<String> topics = new Vector<String>(); - + Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); try { @@ -354,16 +351,16 @@ public class DmozParser { if ("-includeAdultMaterial".equals(argv[i])) { includeAdult = true; } else if ("-subset".equals(argv[i])) { - subsetDenom = Integer.parseInt(argv[i+1]); + subsetDenom = Integer.parseInt(argv[i + 1]); i++; } else if ("-topic".equals(argv[i])) { - topics.addElement(argv[i+1]); + topics.addElement(argv[i + 1]); i++; } else if ("-topicFile".equals(argv[i])) { - addTopicsFromFile(argv[i+1], topics); + addTopicsFromFile(argv[i + 1], topics); i++; } else if ("-skew".equals(argv[i])) { - skew = Integer.parseInt(argv[i+1]); + skew = Integer.parseInt(argv[i + 1]); i++; } } @@ -371,21 +368,21 @@ public class DmozParser { DmozParser parser = new DmozParser(); if (!topics.isEmpty()) { - String regExp = new String("^("); + String regExp = new String("^("); int j = 0; - for ( ; j < topics.size() - 1; ++j) { + for (; j < topics.size() - 1; ++j) { regExp = regExp.concat(topics.get(j)); regExp = regExp.concat("|"); } regExp = regExp.concat(topics.get(j)); - regExp = regExp.concat(").*"); + regExp = regExp.concat(").*"); LOG.info("Topic selection pattern = " + regExp); - topicPattern = Pattern.compile(regExp); + topicPattern = Pattern.compile(regExp); } - parser.parseDmozFile(new File(dmozFile), subsetDenom, - includeAdult, skew, topicPattern); - + parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew, + topicPattern); + } finally { fs.close(); }
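For context on the two DmozParser filters reformatted above — the topic pattern that main() assembles and logs as "Topic selection pattern = ^(topic1|topic2).*", and the subset-denominator filter that keeps roughly 1/denominator of the URLs via an MD5 hash — the following standalone sketch shows both ideas in isolation. It is illustrative only: DmozFilterSketch is a hypothetical class that is not part of this commit, and because the hunk above ends right after "int hashValue = MD5Hash.digest(url).hashCode();", the modulus acceptance test below is an assumed reconstruction rather than the committed code.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.MD5Hash;

/** Illustrative sketch; not part of this commit. */
public class DmozFilterSketch {

  /**
   * Builds the same alternation pattern that DmozParser.main() logs as
   * "Topic selection pattern = ^(topic1|topic2).*".
   */
  static Pattern topicPattern(List<String> topics) {
    StringBuilder regExp = new StringBuilder("^(");
    for (int j = 0; j < topics.size(); j++) {
      if (j > 0)
        regExp.append("|");
      regExp.append(topics.get(j));
    }
    regExp.append(").*");
    return Pattern.compile(regExp.toString());
  }

  /**
   * Subset-denominator sampling: keep a URL with a chance of roughly
   * 1/subsetDenom. The MD5 hash of the URL matches the hunk above; the
   * acceptance test is an assumption, since that line falls outside the
   * quoted diff context.
   */
  static boolean emit(String url, int subsetDenom, int skew) {
    int hashValue = MD5Hash.digest(url).hashCode();
    // normalize into [0, subsetDenom) so negative hash codes are handled
    int bucket = ((hashValue + skew) % subsetDenom + subsetDenom) % subsetDenom;
    return bucket == 0;
  }

  public static void main(String[] args) {
    Pattern p = topicPattern(Arrays.asList("Top/Computers", "Top/Science"));
    System.out.println(p.matcher("Top/Science/Physics").matches()); // true
    System.out.println(emit("http://example.org/", 10, 0)); // fixed per URL
  }
}

Run with -subset 10 and no -topic arguments, DmozParser would emit about one in ten ExternalPage URLs, which matches the "Only emit with a chance of 1/denominator" comment in the hunk above; the topic pattern, when present, further restricts emission to the listed DMOZ sections.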
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Thu Jan 29 05:38:59 2015 @@ -54,20 +54,26 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * <p>The file dumper tool enables one to reverse generate the raw content - * from Nutch segment data directories. </p> + * <p> + * The file dumper tool enables one to reverse generate the raw content from + * Nutch segment data directories. + * </p> * <p> * The tool has a number of immediate uses: * <ol> * <li>one can see what a page looked like at the time it was crawled</li> * <li>one can see different media types acquired as part of the crawl</li> - * <li>it enables us to see webpages before we augment them with additional metadata, - * this can be handy for providing a provenance trail for your crawl data.</li> + * <li>it enables us to see webpages before we augment them with additional + * metadata, this can be handy for providing a provenance trail for your crawl + * data.</li> * </ol> * </p> - * <p>Upon successful completion the tool displays a very convenient JSON snippet - * detailing the mimetype classifications and the counts of documents which - * fall into those classifications. An example is as follows:</p> + * <p> + * Upon successful completion the tool displays a very convenient JSON snippet + * detailing the mimetype classifications and the counts of documents which fall + * into those classifications. An example is as follows: + * </p> + * * <pre> * {@code * INFO: File Types: @@ -92,45 +98,53 @@ import org.slf4j.LoggerFactory; * } * } * </pre> - * <p>In the case above the tool would have been run with the <b>-mimeType - * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> + * <p> + * In the case above the tool would have been run with the <b>-mimeType + * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> * flag and corresponding values activated. - * + * */ public class FileDumper { private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class .getName()); - /** - * Dumps the reverse engineered raw content from the provided segment directories - * if a parent directory contains more than one segment, otherwise a single segment - * can be passed as an argument. - * @param outputDir the directory you wish to dump the raw content to. This directory will be created. - * @param segmentRootDir a directory containing one or more segments. - * @param mimeTypes an array of mime types we have to dump, all others will be filtered out. + * Dumps the reverse engineered raw content from the provided segment + * directories if a parent directory contains more than one segment, otherwise + * a single segment can be passed as an argument. + * + * @param outputDir + * the directory you wish to dump the raw content to. This directory + * will be created. + * @param segmentRootDir + * a directory containing one or more segments. + * @param mimeTypes + * an array of mime types we have to dump, all others will be + * filtered out. 
* @throws Exception */ - public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) throws Exception { - if (mimeTypes == null) LOG.info("Accepting all mimetypes."); - //total file counts + public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) + throws Exception { + if (mimeTypes == null) + LOG.info("Accepting all mimetypes."); + // total file counts Map<String, Integer> typeCounts = new HashMap<String, Integer>(); - //filtered file counts + // filtered file counts Map<String, Integer> filteredCounts = new HashMap<String, Integer>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); int fileCount = 0; - File[] segmentDirs = segmentRootDir - .listFiles(new FileFilter() { + File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() { - @Override - public boolean accept(File file) { - return file.canRead() && file.isDirectory(); - } - }); + @Override + public boolean accept(File file) { + return file.canRead() && file.isDirectory(); + } + }); if (segmentDirs == null) { - System.err.println("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]"); + System.err.println("No segment directories found in [" + + segmentRootDir.getAbsolutePath() + "]"); return; } @@ -138,18 +152,17 @@ public class FileDumper { LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]"); DataOutputStream doutputStream = null; try { - String segmentPath = segment.getAbsolutePath() - + "/" + Content.DIR_NAME + "/part-00000/data"; + String segmentPath = segment.getAbsolutePath() + "/" + Content.DIR_NAME + + "/part-00000/data"; Path file = new Path(segmentPath); if (!new File(file.toString()).exists()) { LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present"); continue; } - SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, - conf); + SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); - Writable key = (Writable)reader.getKeyClass().newInstance(); + Writable key = (Writable) reader.getKeyClass().newInstance(); Content content = null; while (reader.next(key)) { @@ -158,35 +171,33 @@ public class FileDumper { String url = key.toString(); String baseName = FilenameUtils.getBaseName(url); String extension = FilenameUtils.getExtension(url); - if (extension == null || (extension != null && - extension.equals(""))){ + if (extension == null || (extension != null && extension.equals(""))) { extension = "html"; } String filename = baseName + "." 
+ extension; ByteArrayInputStream bas = null; Boolean filter = false; - try{ + try { bas = new ByteArrayInputStream(content.getContent()); String mimeType = new Tika().detect(content.getContent()); collectStats(typeCounts, mimeType); if (mimeType != null) { - if (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType)) { + if (mimeTypes == null + || Arrays.asList(mimeTypes).contains(mimeType)) { collectStats(filteredCounts, mimeType); filter = true; } } - } - catch(Exception e){ + } catch (Exception e) { e.printStackTrace(); - LOG.warn("Tika is unable to detect type for: ["+url+"]"); - } - finally{ - if(bas != null){ - try{ + LOG.warn("Tika is unable to detect type for: [" + url + "]"); + } finally { + if (bas != null) { + try { bas.close(); + } catch (Exception ignore) { } - catch(Exception ignore){} } } @@ -199,51 +210,58 @@ public class FileDumper { IOUtils.write(content.getContent(), output); fileCount++; } else { - LOG.info("Skipping writing: [" - + outputFullPath + "]: file already exists"); + LOG.info("Skipping writing: [" + outputFullPath + + "]: file already exists"); } } } reader.close(); - } - finally { + } finally { fs.close(); - if (doutputStream != null){ - try{ + if (doutputStream != null) { + try { doutputStream.close(); + } catch (Exception ignore) { } - catch (Exception ignore){} } } } - LOG.info("Dumper File Stats: " + displayFileTypes(typeCounts, filteredCounts)); + LOG.info("Dumper File Stats: " + + displayFileTypes(typeCounts, filteredCounts)); } /** * Main method for invoking this tool - * @param args 1) output directory (which will be created) to host the - * raw data and 2) a directory containing one or more segments. + * + * @param args + * 1) output directory (which will be created) to host the raw data + * and 2) a directory containing one or more segments. * @throws Exception */ public static void main(String[] args) throws Exception { - //boolean options + // boolean options Option helpOpt = new Option("h", "help", false, "show this help message"); - //argument options + // argument options @SuppressWarnings("static-access") - Option outputOpt = OptionBuilder.withArgName("outputDir") - .hasArg().withDescription("output directory (which will be created) to host the raw data") - .create("outputDir"); + Option outputOpt = OptionBuilder + .withArgName("outputDir") + .hasArg() + .withDescription( + "output directory (which will be created) to host the raw data") + .create("outputDir"); @SuppressWarnings("static-access") - Option segOpt = OptionBuilder.withArgName("segment") - .hasArgs().withDescription("the segment(s) to use") - .create("segment"); + Option segOpt = OptionBuilder.withArgName("segment").hasArgs() + .withDescription("the segment(s) to use").create("segment"); @SuppressWarnings("static-access") - Option mimeOpt = OptionBuilder.withArgName("mimetype") - .hasArgs().withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.") - .create("mimetype"); + Option mimeOpt = OptionBuilder + .withArgName("mimetype") + .hasArgs() + .withDescription( + "an optional list of mimetypes to dump, excluding all others. 
Defaults to all.") + .create("mimetype"); - //create the options + // create the options Options options = new Options(); options.addOption(helpOpt); options.addOption(outputOpt); @@ -267,13 +285,14 @@ public class FileDumper { if (!outputDir.exists()) { LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it."); - if(!outputDir.mkdirs()) throw new Exception("Unable to create: ["+outputDir.getAbsolutePath()+"]"); + if (!outputDir.mkdirs()) + throw new Exception("Unable to create: [" + + outputDir.getAbsolutePath() + "]"); } FileDumper dumper = new FileDumper(); dumper.dump(outputDir, segmentRootDir, mimeTypes); - } - catch(Exception e) { + } catch (Exception e) { LOG.error("FileDumper: " + StringUtils.stringifyException(e)); e.printStackTrace(); return; @@ -282,13 +301,13 @@ public class FileDumper { private void collectStats(Map<String, Integer> typeCounts, String mimeType) { typeCounts.put(mimeType, - typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 - : 1); + typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1); } - private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) { - StringBuilder builder = new StringBuilder(); - //print total stats + private String displayFileTypes(Map<String, Integer> typeCounts, + Map<String, Integer> filteredCounts) { + StringBuilder builder = new StringBuilder(); + // print total stats builder.append("\n TOTAL Stats:\n"); builder.append(" {\n"); for (String mimeType : typeCounts.keySet()) { Modified: nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Jan 29 05:38:59 2015 @@ -54,19 +54,20 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** - * This tool generates fetchlists (segments to be fetched) from plain text - * files containing one URL per line. It's useful when arbitrary URL-s need to - * be fetched without adding them first to the CrawlDb, or during testing. + * This tool generates fetchlists (segments to be fetched) from plain text files + * containing one URL per line. It's useful when arbitrary URL-s need to be + * fetched without adding them first to the CrawlDb, or during testing. 
*/ public class FreeGenerator extends Configured implements Tool { - private static final Logger LOG = LoggerFactory.getLogger(FreeGenerator.class); - + private static final Logger LOG = LoggerFactory + .getLogger(FreeGenerator.class); + private static final String FILTER_KEY = "free.generator.filter"; private static final String NORMALIZE_KEY = "free.generator.normalize"; - public static class FG extends MapReduceBase - implements Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>, - Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> { + public static class FG extends MapReduceBase implements + Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>, + Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> { private URLNormalizers normalizers = null; private URLFilters filters = null; private ScoringFilters scfilters; @@ -89,13 +90,15 @@ public class FreeGenerator extends Confi Generator.SelectorEntry entry = new Generator.SelectorEntry(); - public void map(WritableComparable<?> key, Text value, OutputCollector<Text, - Generator.SelectorEntry> output, Reporter reporter) throws IOException { + public void map(WritableComparable<?> key, Text value, + OutputCollector<Text, Generator.SelectorEntry> output, Reporter reporter) + throws IOException { // value is a line of text String urlString = value.toString(); try { if (normalizers != null) { - urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_INJECT); + urlString = normalizers.normalize(urlString, + URLNormalizers.SCOPE_INJECT); } if (urlString != null && filters != null) { urlString = filters.filter(urlString); @@ -105,7 +108,8 @@ public class FreeGenerator extends Confi scfilters.injectedScore(url, datum); } } catch (Exception e) { - LOG.warn("Error adding url '" + value.toString() + "', skipping: " + StringUtils.stringifyException(e)); + LOG.warn("Error adding url '" + value.toString() + "', skipping: " + + StringUtils.stringifyException(e)); return; } if (urlString == null) { @@ -122,8 +126,10 @@ public class FreeGenerator extends Confi } public void reduce(Text key, Iterator<Generator.SelectorEntry> values, - OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { - // pick unique urls from values - discard the reduce key due to hash collisions + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { + // pick unique urls from values - discard the reduce key due to hash + // collisions HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>(); while (values.hasNext()) { Generator.SelectorEntry entry = values.next(); @@ -138,12 +144,17 @@ public class FreeGenerator extends Confi public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]"); - System.err.println("\tinputDir\tinput directory containing one or more input files."); - System.err.println("\t\tEach text file contains a list of URLs, one URL per line"); - System.err.println("\tsegmentsDir\toutput directory, where new segment will be created"); + System.err + .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]"); + System.err + .println("\tinputDir\tinput directory containing one or more input files."); + System.err + .println("\t\tEach text file contains a list of URLs, one URL per line"); + System.err + .println("\tsegmentsDir\toutput directory, where new segment will be created"); System.err.println("\t-filter\trun current URLFilters on 
input URLs"); - System.err.println("\t-normalize\trun current URLNormalizers on input URLs"); + System.err + .println("\t-normalize\trun current URLNormalizers on input URLs"); return -1; } boolean filter = false; @@ -181,8 +192,8 @@ public class FreeGenerator extends Confi job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(Generator.HashComparator.class); - FileOutputFormat.setOutputPath(job, new Path(args[1], - new Path(segName, CrawlDatum.GENERATE_DIR_NAME))); + FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, + CrawlDatum.GENERATE_DIR_NAME))); try { JobClient.runJob(job); } catch (Exception e) { @@ -190,12 +201,14 @@ public class FreeGenerator extends Confi return -1; } long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); return 0; } public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args); + int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), + args); System.exit(res); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Thu Jan 29 05:38:59 2015 @@ -59,8 +59,7 @@ public class ResolveUrls { /** * A Thread which gets the ip address of a single host by name. */ - private static class ResolverThread - extends Thread { + private static class ResolverThread extends Thread { private String url = null; @@ -74,14 +73,13 @@ public class ResolveUrls { String host = URLUtil.getHost(url); long start = System.currentTimeMillis(); try { - - // get the address by name and if no error is thrown then it + + // get the address by name and if no error is thrown then it // is resolved successfully InetAddress.getByName(host); LOG.info("Resolved: " + host); numResolved.incrementAndGet(); - } - catch (Exception uhe) { + } catch (Exception uhe) { LOG.info("Error Resolving: " + host); numErrored.incrementAndGet(); } @@ -93,8 +91,8 @@ public class ResolveUrls { } /** - * Creates a thread pool for resolving urls. Reads in the url file on the - * local filesystem. For each url it attempts to resolve it keeping a total + * Creates a thread pool for resolving urls. Reads in the url file on the + * local filesystem. For each url it attempts to resolve it keeping a total * account of the number resolved, errored, and the amount of time. 
*/ public void resolveUrls() { @@ -103,13 +101,13 @@ public class ResolveUrls { // create a thread pool with a fixed number of threads pool = Executors.newFixedThreadPool(numThreads); - + // read in the urls file and loop through each line, one url per line BufferedReader buffRead = new BufferedReader(new FileReader(new File( - urlsFile))); + urlsFile))); String urlStr = null; while ((urlStr = buffRead.readLine()) != null) { - + // spin up a resolver thread per url LOG.info("Starting: " + urlStr); pool.execute(new ResolverThread(urlStr)); @@ -119,9 +117,8 @@ public class ResolveUrls { // the thread pool to give urls time to finish resolving buffRead.close(); pool.awaitTermination(60, TimeUnit.SECONDS); - } - catch (Exception e) { - + } catch (Exception e) { + // on error shutdown the thread pool immediately pool.shutdownNow(); LOG.info(StringUtils.stringifyException(e)); @@ -129,15 +126,16 @@ public class ResolveUrls { // shutdown the thread pool and log totals pool.shutdown(); - LOG.info("Total: " + numTotal.get() + ", Resovled: " - + numResolved.get() + ", Errored: " + numErrored.get() - + ", Average Time: " + totalTime.get() / numTotal.get()); + LOG.info("Total: " + numTotal.get() + ", Resovled: " + numResolved.get() + + ", Errored: " + numErrored.get() + ", Average Time: " + + totalTime.get() / numTotal.get()); } /** * Create a new ResolveUrls with a file from the local file system. - * - * @param urlsFile The local urls file, one url per line. + * + * @param urlsFile + * The local urls file, one url per line. */ public ResolveUrls(String urlsFile) { this(urlsFile, 100); @@ -145,10 +143,12 @@ public class ResolveUrls { /** * Create a new ResolveUrls with a urls file and a number of threads for the - * Thread pool. Number of threads is 100 by default. + * Thread pool. Number of threads is 100 by default. * - * @param urlsFile The local urls file, one url per line. - * @param numThreads The number of threads used to resolve urls in parallel. + * @param urlsFile + * The local urls file, one url per line. + * @param numThreads + * The number of threads used to resolve urls in parallel. */ public ResolveUrls(String urlsFile, int numThreads) { this.urlsFile = urlsFile; @@ -165,13 +165,13 @@ public class ResolveUrls { OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("urls"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the urls file to check"); Option urlOpts = OptionBuilder.create("urls"); options.addOption(urlOpts); - + OptionBuilder.withArgName("numThreads"); OptionBuilder.hasArgs(); OptionBuilder.withDescription("the number of threads to use"); @@ -197,8 +197,7 @@ public class ResolveUrls { } ResolveUrls resolve = new ResolveUrls(urls, numThreads); resolve.resolveUrls(); - } - catch (Exception e) { + } catch (Exception e) { LOG.error("ResolveUrls: " + StringUtils.stringifyException(e)); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java Thu Jan 29 05:38:59 2015 @@ -30,21 +30,22 @@ import org.apache.hadoop.mapred.Reporter /** * A input format the reads arc files. 
*/ -public class ArcInputFormat - extends FileInputFormat<Text, BytesWritable> { +public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> { /** * Returns the <code>RecordReader</code> for reading the arc file. * - * @param split The InputSplit of the arc file to process. - * @param job The job configuration. - * @param reporter The progress reporter. + * @param split + * The InputSplit of the arc file to process. + * @param job + * The job configuration. + * @param reporter + * The progress reporter. */ public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split, - JobConf job, Reporter reporter) - throws IOException { + JobConf job, Reporter reporter) throws IOException { reporter.setStatus(split.toString()); - return new ArcRecordReader(job, (FileSplit)split); + return new ArcRecordReader(job, (FileSplit) split); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Thu Jan 29 05:38:59 2015 @@ -34,23 +34,28 @@ import org.apache.hadoop.util.Reflection import org.apache.hadoop.util.StringUtils; /** - * <p>The <code>ArchRecordReader</code> class provides a record reader which - * reads records from arc files.</p> + * <p> + * The <code>ArchRecordReader</code> class provides a record reader which reads + * records from arc files. + * </p> * - * <p>Arc files are essentially tars of gzips. Each record in an arc file is - * a compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see - * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } .</p> + * <p> + * Arc files are essentially tars of gzips. Each record in an arc file is a + * compressed gzip. Multiple records are concatenated together to form a + * complete arc. For more information on the arc file format see {@link http + * ://www.archive.org/web/researcher/ArcFileFormat.php } . + * </p> * - * <p>Arc files are used by the internet archive and grub projects.</p> + * <p> + * Arc files are used by the internet archive and grub projects. + * </p> * - * see {@link http://www.archive.org/ } - * see {@link http://www.grub.org/ } + * see {@link http://www.archive.org/ } see {@link http://www.grub.org/ } */ -public class ArcRecordReader - implements RecordReader<Text, BytesWritable> { +public class ArcRecordReader implements RecordReader<Text, BytesWritable> { - public static final Logger LOG = LoggerFactory.getLogger(ArcRecordReader.class); + public static final Logger LOG = LoggerFactory + .getLogger(ArcRecordReader.class); protected Configuration conf; protected long splitStart = 0; @@ -60,30 +65,32 @@ public class ArcRecordReader protected long fileLen = 0; protected FSDataInputStream in; - private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B}; + private static byte[] MAGIC = { (byte) 0x1F, (byte) 0x8B }; /** - * <p>Returns true if the byte array passed matches the gzip header magic - * number.</p> + * <p> + * Returns true if the byte array passed matches the gzip header magic number. + * </p> * - * @param input The byte array to check. + * @param input + * The byte array to check. 
* * @return True if the byte array matches the gzip header magic number. */ public static boolean isMagic(byte[] input) { - // check for null and incorrect length + // check for null and incorrect length if (input == null || input.length != MAGIC.length) { return false; } - + // check byte by byte for (int i = 0; i < MAGIC.length; i++) { if (MAGIC[i] != input[i]) { return false; } } - + // must match return true; } @@ -91,13 +98,16 @@ public class ArcRecordReader /** * Constructor that sets the configuration and file split. * - * @param conf The job configuration. - * @param split The file split to read from. + * @param conf + * The job configuration. + * @param split + * The file split to read from. * - * @throws IOException If an IO error occurs while initializing file split. + * @throws IOException + * If an IO error occurs while initializing file split. */ public ArcRecordReader(Configuration conf, FileSplit split) - throws IOException { + throws IOException { Path path = split.getPath(); FileSystem fs = path.getFileSystem(conf); @@ -113,8 +123,7 @@ public class ArcRecordReader /** * Closes the record reader resources. */ - public void close() - throws IOException { + public void close() throws IOException { this.in.close(); } @@ -137,63 +146,64 @@ public class ArcRecordReader * * @return The long of the current position in the file. */ - public long getPos() - throws IOException { + public long getPos() throws IOException { return in.getPos(); } /** - * Returns the percentage of progress in processing the file. This will be + * Returns the percentage of progress in processing the file. This will be * represented as a float from 0 to 1 with 1 being 100% completed. * * @return The percentage of progress as a float from 0 to 1. */ - public float getProgress() - throws IOException { - + public float getProgress() throws IOException { + // if we haven't even started if (splitEnd == splitStart) { return 0.0f; - } - else { - // the progress is current pos - where we started / length of the split - return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen); + } else { + // the progress is current pos - where we started / length of the split + return Math.min(1.0f, (getPos() - splitStart) / (float) splitLen); } } /** - * <p>Returns true if the next record in the split is read into the key and - * value pair. The key will be the arc record header and the values will be - * the raw content bytes of the arc record.</p> + * <p> + * Returns true if the next record in the split is read into the key and value + * pair. The key will be the arc record header and the values will be the raw + * content bytes of the arc record. + * </p> * - * @param key The record key - * @param value The record value + * @param key + * The record key + * @param value + * The record value * * @return True if the next record is read. * - * @throws IOException If an error occurs while reading the record value. + * @throws IOException + * If an error occurs while reading the record value. 
*/ - public boolean next(Text key, BytesWritable value) - throws IOException { + public boolean next(Text key, BytesWritable value) throws IOException { try { - + // get the starting position on the input stream long startRead = in.getPos(); byte[] magicBuffer = null; - + // we need this loop to handle false positives in reading of gzip records while (true) { - + // while we haven't passed the end of the split if (startRead >= splitEnd) { return false; } - + // scanning for the gzip header boolean foundStart = false; while (!foundStart) { - + // start at the current file position and scan for 1K at time, break // if there is no more to read startRead = in.getPos(); @@ -202,13 +212,13 @@ public class ArcRecordReader if (read < 0) { break; } - - // scan the byte array for the gzip header magic number. This happens + + // scan the byte array for the gzip header magic number. This happens // byte by byte for (int i = 0; i < read - 1; i++) { byte[] testMagic = new byte[2]; - System.arraycopy(magicBuffer, i, testMagic, 0, 2); - if (isMagic(testMagic)) { + System.arraycopy(magicBuffer, i, testMagic, 0, 2); + if (isMagic(testMagic)) { // set the next start to the current gzip header startRead += i; foundStart = true; @@ -216,14 +226,14 @@ public class ArcRecordReader } } } - + // seek to the start of the gzip header in.seek(startRead); ByteArrayOutputStream baos = null; int totalRead = 0; try { - + // read 4K of the gzip at a time putting into a byte array byte[] buffer = new byte[4096]; GZIPInputStream zin = new GZIPInputStream(in); @@ -233,9 +243,8 @@ public class ArcRecordReader baos.write(buffer, 0, gzipRead); totalRead += gzipRead; } - } - catch (Exception e) { - + } catch (Exception e) { + // there are times we get false positives where the gzip header exists // but it is not an actual gzip record, so we ignore it and start // over seeking @@ -248,7 +257,7 @@ public class ArcRecordReader // change the output stream to a byte array byte[] content = baos.toByteArray(); - + // the first line of the raw content in arc files is the header int eol = 0; for (int i = 0; i < content.length; i++) { @@ -257,34 +266,33 @@ public class ArcRecordReader break; } } - + // create the header and the raw content minus the header String header = new String(content, 0, eol).trim(); byte[] raw = new byte[(content.length - eol) - 1]; System.arraycopy(content, eol + 1, raw, 0, raw.length); - + // populate key and values with the header and raw content. Text keyText = key; keyText.set(header); BytesWritable valueBytes = value; valueBytes.set(raw, 0, raw.length); - // TODO: It would be best to start at the end of the gzip read but - // the bytes read in gzip don't match raw bytes in the file so we - // overshoot the next header. With this current method you get + // TODO: It would be best to start at the end of the gzip read but + // the bytes read in gzip don't match raw bytes in the file so we + // overshoot the next header. With this current method you get // some false positives but don't miss records. 
if (startRead + 1 < fileLen) { in.seek(startRead + 1); } - + // populated the record, now return return true; } + } catch (Exception e) { + LOG.equals(StringUtils.stringifyException(e)); } - catch (Exception e) { - LOG.equals(StringUtils.stringifyException(e)); - } - + // couldn't populate the record or there is no next record to read return false; } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Thu Jan 29 05:38:59 2015 @@ -61,18 +61,22 @@ import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.TimingUtil; /** - * <p>The <code>ArcSegmentCreator</code> is a replacement for fetcher that will - * take arc files as input and produce a nutch segment as output.</p> + * <p> + * The <code>ArcSegmentCreator</code> is a replacement for fetcher that will + * take arc files as input and produce a nutch segment as output. + * </p> * - * <p>Arc files are tars of compressed gzips which are produced by both the - * internet archive project and the grub distributed crawler project.</p> + * <p> + * Arc files are tars of compressed gzips which are produced by both the + * internet archive project and the grub distributed crawler project. + * </p> * */ -public class ArcSegmentCreator - extends Configured - implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> { +public class ArcSegmentCreator extends Configured implements Tool, + Mapper<Text, BytesWritable, Text, NutchWritable> { - public static final Logger LOG = LoggerFactory.getLogger(ArcSegmentCreator.class); + public static final Logger LOG = LoggerFactory + .getLogger(ArcSegmentCreator.class); public static final String URL_VERSION = "arc.url.version"; private JobConf jobConf; private URLFilters urlFilters; @@ -88,7 +92,9 @@ public class ArcSegmentCreator } /** - * <p>Constructor that sets the job configuration.</p> + * <p> + * Constructor that sets the job configuration. + * </p> * * @param conf */ @@ -104,17 +110,19 @@ public class ArcSegmentCreator public static synchronized String generateSegmentName() { try { Thread.sleep(1000); - } - catch (Throwable t) { + } catch (Throwable t) { } return sdf.format(new Date(System.currentTimeMillis())); } /** - * <p>Configures the job. Sets the url filters, scoring filters, url normalizers - * and other relevant data.</p> + * <p> + * Configures the job. Sets the url filters, scoring filters, url normalizers + * and other relevant data. + * </p> * - * @param job The job configuration. + * @param job + * The job configuration. */ public void configure(JobConf job) { @@ -132,23 +140,31 @@ public class ArcSegmentCreator } /** - * <p>Parses the raw content of a single record to create output. This method - * is almost the same as the {@link org.apache.nutch.Fetcher#output} method in - * terms of processing and output. + * <p> + * Parses the raw content of a single record to create output. This method is + * almost the same as the {@link org.apache.nutch.Fetcher#output} method in + * terms of processing and output. * - * @param output The job output collector. - * @param segmentName The name of the segment to create. - * @param key The url of the record. 
- * @param datum The CrawlDatum of the record. - * @param content The raw content of the record - * @param pstatus The protocol status - * @param status The fetch status. + * @param output + * The job output collector. + * @param segmentName + * The name of the segment to create. + * @param key + * The url of the record. + * @param datum + * The CrawlDatum of the record. + * @param content + * The raw content of the record + * @param pstatus + * The protocol status + * @param status + * The fetch status. * * @return The result of the parse in a ParseStatus object. */ - private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName, - Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, - int status) { + private ParseStatus output(OutputCollector<Text, NutchWritable> output, + String segmentName, Text key, CrawlDatum datum, Content content, + ProtocolStatus pstatus, int status) { // set the fetch status and the fetch time datum.setStatus(status); @@ -164,8 +180,7 @@ public class ArcSegmentCreator // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } @@ -175,16 +190,15 @@ public class ArcSegmentCreator // parse the content parseResult = this.parseUtil.parse(content); - } - catch (Exception e) { + } catch (Exception e) { LOG.warn("Error parsing: " + key + ": " - + StringUtils.stringifyException(e)); + + StringUtils.stringifyException(e)); } // set the content signature if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(getConf()).calculate( - content, new ParseStatus().getEmptyParse(getConf())); + content, new ParseStatus().getEmptyParse(getConf())); datum.setSignature(signature); } @@ -193,7 +207,7 @@ public class ArcSegmentCreator output.collect(key, new NutchWritable(content)); if (parseResult != null) { - for (Entry <Text, Parse> entry : parseResult) { + for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); @@ -203,35 +217,34 @@ public class ArcSegmentCreator parse = parseStatus.getEmptyParse(getConf()); } - // Calculate page signature. - byte[] signature = SignatureFactory.getSignature(getConf()).calculate( - content, parse); + // Calculate page signature. 
+ byte[] signature = SignatureFactory.getSignature(getConf()) + .calculate(content, parse); // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); + parse.getData().getContentMeta() + .set(Nutch.SEGMENT_NAME_KEY, segmentName); + parse.getData().getContentMeta() + .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); + parse.getData().getContentMeta() + .set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime())); if (url.equals(key)) datum.setSignature(signature); try { scfilters.passScoreAfterParsing(url, content, parse); - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } output.collect(url, new NutchWritable(new ParseImpl(new ParseText( - parse.getText()), parse.getData(), parse.isCanonical()))); + parse.getText()), parse.getData(), parse.isCanonical()))); } } - } - catch (IOException e) { + } catch (IOException e) { if (LOG.isErrorEnabled()) { - LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e)); + LOG.error("ArcSegmentCreator caught:" + + StringUtils.stringifyException(e)); } } @@ -243,42 +256,51 @@ public class ArcSegmentCreator } } } - + return null; } /** - * <p>Logs any error that occurs during conversion.</p> + * <p> + * Logs any error that occurs during conversion. + * </p> * - * @param url The url we are parsing. - * @param t The error that occured. + * @param url + * The url we are parsing. + * @param t + * The error that occured. */ private void logError(Text url, Throwable t) { if (LOG.isInfoEnabled()) { - LOG.info("Conversion of " + url + " failed with: " + - StringUtils.stringifyException(t)); + LOG.info("Conversion of " + url + " failed with: " + + StringUtils.stringifyException(t)); } } /** - * <p>Runs the Map job to translate an arc record into output for Nutch - * segments.</p> + * <p> + * Runs the Map job to translate an arc record into output for Nutch segments. + * </p> * - * @param key The arc record header. - * @param bytes The arc record raw content bytes. - * @param output The output collecter. - * @param reporter The progress reporter. + * @param key + * The arc record header. + * @param bytes + * The arc record raw content bytes. + * @param output + * The output collecter. + * @param reporter + * The progress reporter. */ public void map(Text key, BytesWritable bytes, - OutputCollector<Text, NutchWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, NutchWritable> output, Reporter reporter) + throws IOException { String[] headers = key.toString().split("\\s+"); String urlStr = headers[0]; String version = headers[2]; String contentType = headers[3]; - - // arcs start with a file description. for now we ignore this as it is not + + // arcs start with a file description. 
for now we ignore this as it is not // a content record if (urlStr.startsWith("filedesc://")) { LOG.info("Ignoring file header: " + urlStr); @@ -286,18 +308,17 @@ public class ArcSegmentCreator } LOG.info("Processing: " + urlStr); - // get the raw bytes from the arc file, create a new crawldatum + // get the raw bytes from the arc file, create a new crawldatum Text url = new Text(); CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, - 1.0f); + 1.0f); String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY); // normalize and filter the urls try { urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER); urlStr = urlFilters.filter(urlStr); // filter the url - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } @@ -312,37 +333,41 @@ public class ArcSegmentCreator // set the protocol status to success and the crawl status to success // create the content from the normalized url and the raw bytes from - // the arc file, TODO: currently this doesn't handle text of errors + // the arc file, TODO: currently this doesn't handle text of errors // pages (i.e. 404, etc.). We assume we won't get those. ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS; - Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType, - new Metadata(), getConf()); - + Content content = new Content(urlStr, urlStr, bytes.getBytes(), + contentType, new Metadata(), getConf()); + // set the url version into the metadata content.getMetadata().set(URL_VERSION, version); ParseStatus pstatus = null; pstatus = output(output, segmentName, url, datum, content, status, - CrawlDatum.STATUS_FETCH_SUCCESS); + CrawlDatum.STATUS_FETCH_SUCCESS); reporter.progress(); - } - catch (Throwable t) { // unexpected exception + } catch (Throwable t) { // unexpected exception logError(url, t); output(output, segmentName, url, datum, null, null, - CrawlDatum.STATUS_FETCH_RETRY); + CrawlDatum.STATUS_FETCH_RETRY); } } } /** - * <p>Creates the arc files to segments job.</p> + * <p> + * Creates the arc files to segments job. + * </p> * - * @param arcFiles The path to the directory holding the arc files - * @param segmentsOutDir The output directory for writing the segments + * @param arcFiles + * The path to the directory holding the arc files + * @param segmentsOutDir + * The output directory for writing the segments * - * @throws IOException If an IO error occurs while running the job. + * @throws IOException + * If an IO error occurs while running the job. 
*/ public void createSegments(Path arcFiles, Path segmentsOutDir) - throws IOException { + throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -366,17 +391,17 @@ public class ArcSegmentCreator JobClient.runJob(job); long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } - public static void main(String args[]) - throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args); + public static void main(String args[]) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), + new ArcSegmentCreator(), args); System.exit(res); } - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>"; @@ -393,8 +418,7 @@ public class ArcSegmentCreator // create the segments from the arc files createSegments(arcFiles, segmentsOutDir); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e)); return -1; } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>. */ package org.apache.nutch.tools.arc; + Modified: nutch/trunk/src/java/org/apache/nutch/tools/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Miscellaneous tools. */ package org.apache.nutch.tools; + Modified: nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Thu Jan 29 05:38:59 2015 @@ -82,11 +82,11 @@ public class CommandRunner { } public void evaluate() throws IOException { - this.exec(); + this.exec(); } /** - * + * * @return process exit value (return code) or -1 if timed out. * @throws IOException */ @@ -94,13 +94,11 @@ public class CommandRunner { Process proc = Runtime.getRuntime().exec(_command); _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.
  */
 package org.apache.nutch.tools.arc;
+

Modified: nutch/trunk/src/java/org/apache/nutch/tools/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Miscellaneous tools.
  */
 package org.apache.nutch.tools;
+

Modified: nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Thu Jan 29 05:38:59 2015
@@ -82,11 +82,11 @@ public class CommandRunner {
   }
 
   public void evaluate() throws IOException {
-    this.exec();
+    this.exec();
   }
 
   /**
-   * 
+   * 
    * @return process exit value (return code) or -1 if timed out.
   * @throws IOException
   */
@@ -94,13 +94,11 @@ public class CommandRunner {
     Process proc = Runtime.getRuntime().exec(_command);
     _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));
 
-    PullerThread so =
-      new PullerThread("STDOUT", proc.getInputStream(), _stdout);
+    PullerThread so = new PullerThread("STDOUT", proc.getInputStream(), _stdout);
     so.setDaemon(true);
     so.start();
 
-    PullerThread se =
-      new PullerThread("STDERR", proc.getErrorStream(), _stderr);
+    PullerThread se = new PullerThread("STDERR", proc.getErrorStream(), _stderr);
     se.setDaemon(true);
     se.start();
@@ -145,11 +143,11 @@ public class CommandRunner {
         Thread.sleep(1000);
         _xit = proc.exitValue();
       } catch (InterruptedException ie) {
-      if (Thread.interrupted()) {
-        break; // stop waiting on an interrupt for this thread
-      } else {
-        continue;
-      }
+        if (Thread.interrupted()) {
+          break; // stop waiting on an interrupt for this thread
+        } else {
+          continue;
+        }
       } catch (IllegalThreadStateException iltse) {
         continue;
       }
@@ -181,11 +179,8 @@ public class CommandRunner {
 
     private boolean _closeInput;
 
-    protected PumperThread(
-      String name,
-      InputStream is,
-      OutputStream os,
-      boolean closeInput) {
+    protected PumperThread(String name, InputStream is, OutputStream os,
+        boolean closeInput) {
       super(name);
       _is = is;
       _os = os;
@@ -218,12 +213,12 @@ public class CommandRunner {
         }
       }
       try {
-      _barrier.await();
-      } catch (InterruptedException ie) {
-      /* IGNORE */
-      } catch (BrokenBarrierException bbe) {
-      /* IGNORE */
-      }
+        _barrier.await();
+      } catch (InterruptedException ie) {
+        /* IGNORE */
+      } catch (BrokenBarrierException bbe) {
+        /* IGNORE */
+      }
     }
   }
 
@@ -269,8 +264,9 @@ public class CommandRunner {
 
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-timeout")) {
-        timeout = Integer.parseInt(args[++i]);;
-      } else if (i != args.length-2) {
+        timeout = Integer.parseInt(args[++i]);
+        ;
+      } else if (i != args.length - 2) {
         System.err.println(usage);
         System.exit(-1);
       } else {
@@ -290,6 +286,6 @@ public class CommandRunner {
 
     cr.evaluate();
 
-    System.err.println("output value: "+cr.getExitValue());
+    System.err.println("output value: " + cr.getExitValue());
   }
 }
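CommandRunner, reformatted above, executes a child process, drains STDOUT/STDERR on daemon threads synchronized by a CyclicBarrier, and reports -1 on timeout. A hedged usage sketch; the setter names (setCommand, setTimeout, setStdOutputStream, setStdErrorStream) are inferred from the fields shown in this diff, so verify them against the class before relying on this:

  import java.io.ByteArrayOutputStream;
  import org.apache.nutch.util.CommandRunner;

  public class CommandRunnerSketch {
    public static void main(String[] args) throws Exception {
      CommandRunner cr = new CommandRunner();
      cr.setCommand("ls -l");   // hypothetical external command
      cr.setTimeout(10);        // seconds; exit value is -1 on timeout

      ByteArrayOutputStream out = new ByteArrayOutputStream();
      ByteArrayOutputStream err = new ByteArrayOutputStream();
      cr.setStdOutputStream(out);
      cr.setStdErrorStream(err);

      cr.evaluate();            // delegates to exec() per the diff above
      System.err.println("output value: " + cr.getExitValue());
      System.out.println(out.toString());
    }
  }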
Modified: nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Thu Jan 29 05:38:59 2015
@@ -28,19 +28,18 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * A collection of utility methods for working on deflated data.
+ * A collection of utility methods for working on deflated data.
 */
 public class DeflateUtils {
-  
+
   private static final Logger LOG = LoggerFactory.getLogger(DeflateUtils.class);
   private static final int EXPECTED_COMPRESSION_RATIO = 5;
   private static final int BUF_SIZE = 4096;
 
   /**
-   * Returns an inflated copy of the input array. If the deflated
-   * input has been truncated or corrupted, a best-effort attempt is
-   * made to inflate as much as possible. If no data can be extracted
-   * <code>null</code> is returned.
+   * Returns an inflated copy of the input array. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
   */
  public static final byte[] inflateBestEffort(byte[] in) {
    return inflateBestEffort(in, Integer.MAX_VALUE);
@@ -48,37 +47,36 @@ public class DeflateUtils {
 
   /**
    * Returns an inflated copy of the input array, truncated to
-   * <code>sizeLimit</code> bytes, if necessary. If the deflated input
-   * has been truncated or corrupted, a best-effort attempt is made to
-   * inflate as much as possible. If no data can be extracted
-   * <code>null</code> is returned.
+   * <code>sizeLimit</code> bytes, if necessary. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
    */
   public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
-    // decompress using InflaterInputStream
-    ByteArrayOutputStream outStream =
-      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
 
     // "true" because HTTP does not provide zlib headers
     Inflater inflater = new Inflater(true);
-    InflaterInputStream inStream =
-      new InflaterInputStream(new ByteArrayInputStream(in), inflater);
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in), inflater);
 
     byte[] buf = new byte[BUF_SIZE];
     int written = 0;
     while (true) {
       try {
-      int size = inStream.read(buf);
-      if (size <= 0)
-        break;
-      if ((written + size) > sizeLimit) {
-        outStream.write(buf, 0, sizeLimit - written);
-        break;
-      }
-      outStream.write(buf, 0, size);
-      written+= size;
+        int size = inStream.read(buf);
+        if (size <= 0)
+          break;
+        if ((written + size) > sizeLimit) {
+          outStream.write(buf, 0, sizeLimit - written);
+          break;
+        }
+        outStream.write(buf, 0, size);
+        written += size;
       } catch (Exception e) {
-        LOG.info( "Caught Exception in inflateBestEffort", e );
-        break;
+        LOG.info("Caught Exception in inflateBestEffort", e);
+        break;
       }
     }
     try {
@@ -89,23 +87,24 @@ public class DeflateUtils {
     return outStream.toByteArray();
   }
 
-
   /**
-   * Returns an inflated copy of the input array.
-   * @throws IOException if the input cannot be properly decompressed
+   * Returns an inflated copy of the input array.
+   * 
+   * @throws IOException
+   *           if the input cannot be properly decompressed
   */
  public static final byte[] inflate(byte[] in) throws IOException {
-    // decompress using InflaterInputStream
-    ByteArrayOutputStream outStream =
-      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
 
-    InflaterInputStream inStream =
-      new InflaterInputStream ( new ByteArrayInputStream(in) );
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in));
 
     byte[] buf = new byte[BUF_SIZE];
     while (true) {
       int size = inStream.read(buf);
-      if (size <= 0)
+      if (size <= 0)
         break;
       outStream.write(buf, 0, size);
     }
@@ -118,9 +117,9 @@ public class DeflateUtils {
    * Returns a deflated copy of the input array.
   */
  public static final byte[] deflate(byte[] in) {
-    // compress using DeflaterOutputStream
-    ByteArrayOutputStream byteOut =
-      new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+    // compress using DeflaterOutputStream
+    ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+        / EXPECTED_COMPRESSION_RATIO);
 
     DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
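One detail worth noting while reading this diff: deflate() writes zlib-wrapped output (a plain DeflaterOutputStream), which pairs with inflate(); inflateBestEffort() instead constructs Inflater(true) because HTTP servers send raw deflate data without zlib headers, so the two are not interchangeable. A minimal round-trip sketch using the zlib-wrapped pair:

  import java.nio.charset.StandardCharsets;
  import org.apache.nutch.util.DeflateUtils;

  public class DeflateRoundTrip {
    public static void main(String[] args) throws Exception {
      byte[] original = "hello, nutch".getBytes(StandardCharsets.UTF_8);
      // deflate() emits zlib-wrapped data, so pair it with inflate(),
      // not with inflateBestEffort(), which expects raw HTTP deflate
      byte[] compressed = DeflateUtils.deflate(original);
      byte[] restored = DeflateUtils.inflate(compressed);
      System.out.println(new String(restored, StandardCharsets.UTF_8));
    }
  }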
Modified: nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java Thu Jan 29 05:38:59 2015
@@ -38,7 +38,6 @@ import org.xml.sax.SAXException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-
 public class DomUtil {
 
   private final static Logger LOG = LoggerFactory.getLogger(DomUtil.class);
@@ -61,10 +60,10 @@ public class DomUtil {
       input.setEncoding("UTF-8");
       parser.parse(input);
       int i = 0;
-      while (! (parser.getDocument().getChildNodes().item(i) instanceof Element)) {
-      i++;
-      }
-      element = (Element)parser.getDocument().getChildNodes().item(i);
+      while (!(parser.getDocument().getChildNodes().item(i) instanceof Element)) {
+        i++;
+      }
+      element = (Element) parser.getDocument().getChildNodes().item(i);
     } catch (FileNotFoundException e) {
       LOG.error("Error: ", e);
     } catch (SAXException e) {
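The loop cleaned up above skips leading non-Element nodes (comments, processing instructions) to reach the document's first real element. For illustration, the same pattern using the JDK's DocumentBuilder rather than the Xerces DOMParser used here:

  import java.io.ByteArrayInputStream;
  import javax.xml.parsers.DocumentBuilderFactory;
  import org.w3c.dom.Document;
  import org.w3c.dom.Element;

  public class FirstElementSketch {
    public static void main(String[] args) throws Exception {
      byte[] xml = "<!-- note --><root><child/></root>".getBytes("UTF-8");
      Document doc = DocumentBuilderFactory.newInstance()
          .newDocumentBuilder().parse(new ByteArrayInputStream(xml));
      // skip the leading comment node to reach the first Element
      int i = 0;
      while (!(doc.getChildNodes().item(i) instanceof Element)) {
        i++;
      }
      Element element = (Element) doc.getChildNodes().item(i);
      System.out.println(element.getTagName()); // prints "root"
    }
  }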
Modified: nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java Thu Jan 29 05:38:59 2015
@@ -39,27 +39,26 @@ import com.ibm.icu.text.CharsetMatch;
 
 /**
  * A simple class for detecting character encodings.
- * 
+ * 
  * <p>
  * Broadly this encompasses two functions, which are distinctly separate:
- * 
+ * 
  * <ol>
- * <li>Auto detecting a set of "clues" from input text.</li>
- * <li>Taking a set of clues and making a "best guess" as to the
- * "real" encoding.</li>
+ * <li>Auto detecting a set of "clues" from input text.</li>
+ * <li>Taking a set of clues and making a "best guess" as to the "real"
+ * encoding.</li>
  * </ol>
 * </p>
- * 
+ * 
 * <p>
- * A caller will often have some extra information about what the
- * encoding might be (e.g. from the HTTP header or HTML meta-tags, often
- * wrong but still potentially useful clues). The types of clues may differ
- * from caller to caller. Thus a typical calling sequence is:
+ * A caller will often have some extra information about what the encoding might
+ * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
+ * potentially useful clues). The types of clues may differ from caller to
+ * caller. Thus a typical calling sequence is:
  * <ul>
- * <li>Run step (1) to generate a set of auto-detected clues;</li>
- * <li>Combine these clues with the caller-dependent "extra clues"
- * available;</li>
- * <li>Run step (2) to guess what the most probable answer is.</li>
+ * <li>Run step (1) to generate a set of auto-detected clues;</li>
+ * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
+ * <li>Run step (2) to guess what the most probable answer is.</li>
 * </p>
 */
 public class EncodingDetector {
@@ -89,34 +88,32 @@ public class EncodingDetector {
     }
 
     public String toString() {
-      return value + " (" + source +
-        ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
+      return value + " (" + source
+          + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
     }
 
     public boolean isEmpty() {
-      return (value==null || "".equals(value));
+      return (value == null || "".equals(value));
     }
 
     public boolean meetsThreshold() {
-      return (confidence < 0 ||
-        (minConfidence >= 0 && confidence >= minConfidence));
+      return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
     }
   }
 
-  public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(EncodingDetector.class);
 
   public static final int NO_THRESHOLD = -1;
 
-  public static final String MIN_CONFIDENCE_KEY =
-    "encodingdetector.charset.min.confidence";
+  public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
 
-  private static final HashMap<String, String> ALIASES =
-    new HashMap<String, String>();
+  private static final HashMap<String, String> ALIASES = new HashMap<String, String>();
 
   private static final HashSet<String> DETECTABLES = new HashSet<String>();
 
   // CharsetDetector will die without a minimum amount of data.
-  private static final int MIN_LENGTH=4;
+  private static final int MIN_LENGTH = 4;
 
   static {
     DETECTABLES.add("text/html");
@@ -129,23 +126,22 @@ public class EncodingDetector {
     DETECTABLES.add("application/rss+xml");
     DETECTABLES.add("application/xhtml+xml");
     /*
-     * the following map is not an alias mapping table, but
-     * maps character encodings which are often used in mislabelled
-     * documents to their correct encodings. For instance,
-     * there are a lot of documents labelled 'ISO-8859-1' which contain
-     * characters not covered by ISO-8859-1 but covered by windows-1252.
-     * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
-     * for the common part), it's better to treat ISO-8859-1 as
-     * synonymous with windows-1252 than to reject, as invalid, documents
-     * labelled as ISO-8859-1 that have characters outside ISO-8859-1.
+     * the following map is not an alias mapping table, but maps character
+     * encodings which are often used in mislabelled documents to their correct
+     * encodings. For instance, there are a lot of documents labelled
+     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
+     * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
+     * (sharing code points for the common part), it's better to treat
+     * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
+     * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
     */
    ALIASES.put("ISO-8859-1", "windows-1252");
    ALIASES.put("EUC-KR", "x-windows-949");
    ALIASES.put("x-EUC-CN", "GB18030");
    ALIASES.put("GBK", "GB18030");
-    //ALIASES.put("Big5", "Big5HKSCS");
-    //ALIASES.put("TIS620", "Cp874");
-    //ALIASES.put("ISO-8859-11", "Cp874");
+    // ALIASES.put("Big5", "Big5HKSCS");
+    // ALIASES.put("TIS620", "Cp874");
+    // ALIASES.put("ISO-8859-11", "Cp874");
   }
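The ISO-8859-1 to windows-1252 mapping installed above is easy to demonstrate: bytes in the 0x80-0x9F range are unassigned control codes in ISO-8859-1 but printable punctuation in windows-1252, and mislabelled documents use them freely. A small JDK-only illustration:

  import java.nio.charset.Charset;

  public class AliasSketch {
    public static void main(String[] args) {
      // 0x93 and 0x94 are curly quotes in windows-1252
      byte[] bytes = { (byte) 0x93, 'h', 'i', (byte) 0x94 };
      // decoded as ISO-8859-1: invisible C1 control characters
      System.out.println(new String(bytes, Charset.forName("ISO-8859-1")));
      // decoded as windows-1252: prints the intended quoted text
      System.out.println(new String(bytes, Charset.forName("windows-1252")));
    }
  }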
@@ -188,8 +184,9 @@ public class EncodingDetector {
     }
 
     // add character encoding coming from HTTP response header
-    addClue(parseCharacterEncoding(
-        content.getMetadata().get(Response.CONTENT_TYPE)), "header");
+    addClue(
+        parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)),
+        "header");
   }
 
   public void addClue(String value, String source, int confidence) {
@@ -208,21 +205,23 @@ public class EncodingDetector {
 
   /**
    * Guess the encoding with the previously specified list of clues.
-   * 
-   * @param content Content instance
-   * @param defaultValue Default encoding to return if no encoding can be
-   * detected with enough confidence. Note that this will <b>not</b> be
-   * normalized with {@link EncodingDetector#resolveEncodingAlias}
-   * 
+   * 
+   * @param content
+   *          Content instance
+   * @param defaultValue
+   *          Default encoding to return if no encoding can be detected with
+   *          enough confidence. Note that this will <b>not</b> be normalized
+   *          with {@link EncodingDetector#resolveEncodingAlias}
+   * 
    * @return Guessed encoding or defaultValue
    */
   public String guessEncoding(Content content, String defaultValue) {
     /*
-     * This algorithm could be replaced by something more sophisticated;
-     * ideally we would gather a bunch of data on where various clues
-     * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
-     * the correct answer, and use machine learning/some statistical method
-     * to generate a better heuristic.
+     * This algorithm could be replaced by something more sophisticated; ideally
+     * we would gather a bunch of data on where various clues (autodetect, HTTP
+     * headers, HTML meta tags, etc.) disagree, tag each with the correct
+     * answer, and use machine learning/some statistical method to generate a
+     * better heuristic.
      */
 
     String base = content.getBaseUrl();
@@ -232,10 +231,9 @@ public class EncodingDetector {
     }
 
     /*
-     * Go down the list of encoding "clues". Use a clue if:
-     * 1. Has a confidence value which meets our confidence threshold, OR
-     * 2. Doesn't meet the threshold, but is the best try,
-     * since nothing else is available.
+     * Go down the list of encoding "clues". Use a clue if: 1. Has a confidence
+     * value which meets our confidence threshold, OR 2. Doesn't meet the
+     * threshold, but is the best try, since nothing else is available.
      */
     EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
     EncodingClue bestClue = defaultClue;
@@ -247,8 +245,8 @@ public class EncodingDetector {
       String charset = clue.value;
       if (minConfidence >= 0 && clue.confidence >= minConfidence) {
         if (LOG.isTraceEnabled()) {
-          LOG.trace(base + ": Choosing encoding: " + charset +
-              " with confidence " + clue.confidence);
+          LOG.trace(base + ": Choosing encoding: " + charset
+              + " with confidence " + clue.confidence);
         }
         return resolveEncodingAlias(charset).toLowerCase();
       } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
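Putting the clue machinery above together: confidence filtering is controlled through MIN_CONFIDENCE_KEY, and guessEncoding() falls back to the supplied default when nothing meets the threshold. A sketch built only from calls visible in this diff; the URL, payload, and fallback encoding are illustrative:

  import org.apache.hadoop.conf.Configuration;
  import org.apache.nutch.metadata.Metadata;
  import org.apache.nutch.protocol.Content;
  import org.apache.nutch.util.EncodingDetector;
  import org.apache.nutch.util.NutchConfiguration;

  public class GuessEncodingSketch {
    public static void main(String[] args) throws Exception {
      Configuration conf = NutchConfiguration.create();
      // clues below this confidence are skipped unless nothing better exists
      conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);

      byte[] data = "<html><body>caf\u00e9</body></html>".getBytes("UTF-8");
      Content content = new Content("http://example.com/",
          "http://example.com/", data, "text/html", new Metadata(), conf);

      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      String encoding = detector.guessEncoding(content, "windows-1252");
      System.out.println("guessed: " + encoding);
    }
  }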
@@ -268,10 +266,10 @@ public class EncodingDetector {
   }
 
   /*
-   * Strictly for analysis, look for "disagreements." The top guess from
-   * each source is examined; if these meet the threshold and disagree, then
-   * we log the information -- useful for testing or generating training data
-   * for a better heuristic.
+   * Strictly for analysis, look for "disagreements." The top guess from each
+   * source is examined; if these meet the threshold and disagree, then we log
+   * the information -- useful for testing or generating training data for a
+   * better heuristic.
    */
   private void findDisagreements(String url, List<EncodingClue> newClues) {
     HashSet<String> valsSeen = new HashSet<String>();
@@ -293,9 +291,9 @@ public class EncodingDetector {
     if (disagreement) {
       // dump all values in case of disagreement
       StringBuffer sb = new StringBuffer();
-      sb.append("Disagreement: "+url+"; ");
+      sb.append("Disagreement: " + url + "; ");
       for (int i = 0; i < newClues.size(); i++) {
-        if (i>0) {
+        if (i > 0) {
           sb.append(", ");
         }
         sb.append(newClues.get(i));
@@ -310,7 +308,7 @@ public class EncodingDetector {
       return null;
     String canonicalName = new String(Charset.forName(encoding).name());
     return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
-      : canonicalName;
+        : canonicalName;
   } catch (Exception e) {
     LOG.warn("Invalid encoding " + encoding + " detected, using default.");
     return null;
@@ -318,14 +316,14 @@ public class EncodingDetector {
   }
 
   /**
-   * Parse the character encoding from the specified content type header.
-   * If the content type is null, or there is no explicit character encoding,
-   * <code>null</code> is returned.
-   * <br />
-   * This method was copied from org.apache.catalina.util.RequestUtil,
-   * which is licensed under the Apache License, Version 2.0 (the "License").
-   * 
-   * @param contentType a content type header
+   * Parse the character encoding from the specified content type header. If the
+   * content type is null, or there is no explicit character encoding,
+   * <code>null</code> is returned. <br />
+   * This method was copied from org.apache.catalina.util.RequestUtil, which is
+   * licensed under the Apache License, Version 2.0 (the "License").
+   * 
+   * @param contentType
+   *          a content type header
   */
  public static String parseCharacterEncoding(String contentType) {
    if (contentType == null)
@@ -339,7 +337,7 @@ public class EncodingDetector {
       encoding = encoding.substring(0, end);
     encoding = encoding.trim();
     if ((encoding.length() > 2) && (encoding.startsWith("\""))
-      && (encoding.endsWith("\"")))
+        && (encoding.endsWith("\"")))
       encoding = encoding.substring(1, encoding.length() - 1);
 
     return (encoding.trim());
@@ -352,12 +350,12 @@ public class EncodingDetector {
     }
 
     Configuration conf = NutchConfiguration.create();
-    EncodingDetector detector =
-      new EncodingDetector(NutchConfiguration.create());
+    EncodingDetector detector = new EncodingDetector(
+        NutchConfiguration.create());
 
     // do everything as bytes; don't want any conversion
-    BufferedInputStream istr =
-      new BufferedInputStream(new FileInputStream(args[0]));
+    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(
+        args[0]));
     ByteArrayOutputStream ostr = new ByteArrayOutputStream();
     byte[] bytes = new byte[1000];
     boolean more = true;
@@ -376,8 +374,8 @@ public class EncodingDetector {
     byte[] data = ostr.toByteArray();
 
     // make a fake Content
-    Content content =
-      new Content("", "", data, "text/html", new Metadata(), conf);
+    Content content = new Content("", "", data, "text/html", new Metadata(),
+        conf);
 
     detector.autoDetectClues(content, true);
     String encoding = detector.guessEncoding(content,
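parseCharacterEncoding() appears in full above, including the substring logic for quoted charsets; a few representative inputs for it, with expected results in comments:

  import org.apache.nutch.util.EncodingDetector;

  public class HeaderCharsetSketch {
    public static void main(String[] args) {
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html; charset=utf-8"));     // utf-8
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html; charset=\"utf-8\"")); // utf-8
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html"));                    // null
    }
  }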
