Modified: nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Thu Jan 29 05:38:59 2015 @@ -35,16 +35,15 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; - /** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */ public class DmozParser { public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class); - - long pages = 0; + + long pages = 0; /** - * This filter fixes characters that might offend our parser. - * This lets us be tolerant of errors that might appear in the input XML. + * This filter fixes characters that might offend our parser. This lets us be + * tolerant of errors that might appear in the input XML. */ private static class XMLCharFilter extends FilterReader { private boolean lastBad = false; @@ -56,9 +55,9 @@ public class DmozParser { public int read() throws IOException { int c = in.read(); int value = c; - if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters + if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters value = 'X'; - else if (lastBad && c == '<') { // fix mis-matched brackets + else if (lastBad && c == '<') { // fix mis-matched brackets in.mark(1); if (in.read() != '/') value = 'X'; @@ -69,37 +68,35 @@ public class DmozParser { return value; } - public int read(char[] cbuf, int off, int len) - throws IOException { + public int read(char[] cbuf, int off, int len) throws IOException { int n = in.read(cbuf, off, len); if (n != -1) { for (int i = 0; i < n; i++) { - char c = cbuf[off+i]; + char c = cbuf[off + i]; char value = c; - if (!(XMLChar.isValid(c))) // fix invalid characters + if (!(XMLChar.isValid(c))) // fix invalid characters value = 'X'; - else if (lastBad && c == '<') { // fix mis-matched brackets - if (i != n-1 && cbuf[off+i+1] != '/') + else if (lastBad && c == '<') { // fix mis-matched brackets + if (i != n - 1 && cbuf[off + i + 1] != '/') value = 'X'; } lastBad = (c == 65533); - cbuf[off+i] = value; + cbuf[off + i] = value; } } return n; } } - /** - * The RDFProcessor receives tag messages during a parse - * of RDF XML data. We build whatever structures we need - * from these messages. + * The RDFProcessor receives tag messages during a parse of RDF XML data. We + * build whatever structures we need from these messages. */ private class RDFProcessor extends DefaultHandler { String curURL = null, curSection = null; - boolean titlePending = false, descPending = false, insideAdultSection = false; - Pattern topicPattern = null; + boolean titlePending = false, descPending = false, + insideAdultSection = false; + Pattern topicPattern = null; StringBuffer title = new StringBuffer(), desc = new StringBuffer(); XMLReader reader; int subsetDenom; @@ -108,10 +105,12 @@ public class DmozParser { Locator location; /** - * Pass in an XMLReader, plus a flag as to whether we - * should include adult material. + * Pass in an XMLReader, plus a flag as to whether we should include adult + * material. 
*/ - public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException { + public RDFProcessor(XMLReader reader, int subsetDenom, + boolean includeAdult, int skew, Pattern topicPattern) + throws IOException { this.reader = reader; this.subsetDenom = subsetDenom; this.includeAdult = includeAdult; @@ -127,20 +126,21 @@ public class DmozParser { /** * Start of an XML elt */ - public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { + public void startElement(String namespaceURI, String localName, + String qName, Attributes atts) throws SAXException { if ("Topic".equals(qName)) { curSection = atts.getValue("r:id"); } else if ("ExternalPage".equals(qName)) { // Porn filter - if ((! includeAdult) && curSection.startsWith("Top/Adult")) { + if ((!includeAdult) && curSection.startsWith("Top/Adult")) { return; } - + if (topicPattern != null && !topicPattern.matcher(curSection).matches()) { return; } - // Subset denominator filter. + // Subset denominator filter. // Only emit with a chance of 1/denominator. String url = atts.getValue("about"); int hashValue = MD5Hash.digest(url).hashCode(); @@ -173,18 +173,18 @@ public class DmozParser { * Termination of XML elt */ public void endElement(String namespaceURI, String localName, String qName) - throws SAXException { + throws SAXException { if (curURL != null) { if ("ExternalPage".equals(qName)) { // - // Inc the number of pages, insert the page, and + // Inc the number of pages, insert the page, and // possibly print status. // - System.out.println(curURL); + System.out.println(curURL); pages++; // - // Clear out the link text. This is what + // Clear out the link text. This is what // you would use for adding to the linkdb. // if (title.length() > 0) { @@ -219,15 +219,13 @@ public class DmozParser { } /** - * From time to time the Parser will set the "current location" - * by calling this function. It's useful for emitting locations - * for error messages. + * From time to time the Parser will set the "current location" by calling + * this function. It's useful for emitting locations for error messages. */ public void setDocumentLocator(Locator locator) { location = locator; } - // // Interface ErrorHandler // @@ -247,11 +245,11 @@ public class DmozParser { public void errorError(SAXParseException spe) { if (LOG.isErrorEnabled()) { LOG.error("Fatal err: " + spe.toString() + ": " + spe.getMessage()); - LOG.error("Last known line is " + location.getLineNumber() + - ", column " + location.getColumnNumber()); + LOG.error("Last known line is " + location.getLineNumber() + + ", column " + location.getColumnNumber()); } } - + /** * Emit exception warning message */ @@ -263,34 +261,33 @@ public class DmozParser { } /** - * Iterate through all the items in this structured DMOZ file. - * Add each URL to the web db. + * Iterate through all the items in this structured DMOZ file. Add each URL to + * the web db. 
*/ public void parseDmozFile(File dmozFile, int subsetDenom, - boolean includeAdult, - int skew, - Pattern topicPattern) + boolean includeAdult, int skew, Pattern topicPattern) - throws IOException, SAXException, ParserConfigurationException { + throws IOException, SAXException, ParserConfigurationException { SAXParserFactory parserFactory = SAXParserFactory.newInstance(); SAXParser parser = parserFactory.newSAXParser(); XMLReader reader = parser.getXMLReader(); // Create our own processor to receive SAX events - RDFProcessor rp = - new RDFProcessor(reader, subsetDenom, includeAdult, - skew, topicPattern); + RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew, + topicPattern); reader.setContentHandler(rp); reader.setErrorHandler(rp); LOG.info("skew = " + rp.hashSkew); // - // Open filtered text stream. The TextFilter makes sure that + // Open filtered text stream. The TextFilter makes sure that // only appropriate XML-approved Text characters are received. // Any non-conforming characters are silently skipped. // - XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8"))); + XMLCharFilter in = new XMLCharFilter(new BufferedReader( + new InputStreamReader(new BufferedInputStream(new FileInputStream( + dmozFile)), "UTF-8"))); try { InputSource is = new InputSource(in); reader.parse(is); @@ -304,18 +301,17 @@ public class DmozParser { } } - private static void addTopicsFromFile(String topicFile, - Vector<String> topics) - throws IOException { + private static void addTopicsFromFile(String topicFile, Vector<String> topics) + throws IOException { BufferedReader in = null; try { - in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8")); + in = new BufferedReader(new InputStreamReader(new FileInputStream( + topicFile), "UTF-8")); String line = null; while ((line = in.readLine()) != null) { topics.addElement(new String(line)); } - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(e.toString()); } @@ -324,18 +320,19 @@ public class DmozParser { in.close(); } } - + /** - * Command-line access. User may add URLs via a flat text file - * or the structured DMOZ file. By default, we ignore Adult - * material (as categorized by DMOZ). + * Command-line access. User may add URLs via a flat text file or the + * structured DMOZ file. By default, we ignore Adult material (as categorized + * by DMOZ). 
*/ public static void main(String argv[]) throws Exception { if (argv.length < 1) { - System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); + System.err + .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); return; } - + // // Parse the command line, figure out what kind of // URL file we need to load @@ -344,9 +341,9 @@ public class DmozParser { int skew = 0; String dmozFile = argv[0]; boolean includeAdult = false; - Pattern topicPattern = null; + Pattern topicPattern = null; Vector<String> topics = new Vector<String>(); - + Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); try { @@ -354,16 +351,16 @@ public class DmozParser { if ("-includeAdultMaterial".equals(argv[i])) { includeAdult = true; } else if ("-subset".equals(argv[i])) { - subsetDenom = Integer.parseInt(argv[i+1]); + subsetDenom = Integer.parseInt(argv[i + 1]); i++; } else if ("-topic".equals(argv[i])) { - topics.addElement(argv[i+1]); + topics.addElement(argv[i + 1]); i++; } else if ("-topicFile".equals(argv[i])) { - addTopicsFromFile(argv[i+1], topics); + addTopicsFromFile(argv[i + 1], topics); i++; } else if ("-skew".equals(argv[i])) { - skew = Integer.parseInt(argv[i+1]); + skew = Integer.parseInt(argv[i + 1]); i++; } } @@ -371,21 +368,21 @@ public class DmozParser { DmozParser parser = new DmozParser(); if (!topics.isEmpty()) { - String regExp = new String("^("); + String regExp = new String("^("); int j = 0; - for ( ; j < topics.size() - 1; ++j) { + for (; j < topics.size() - 1; ++j) { regExp = regExp.concat(topics.get(j)); regExp = regExp.concat("|"); } regExp = regExp.concat(topics.get(j)); - regExp = regExp.concat(").*"); + regExp = regExp.concat(").*"); LOG.info("Topic selection pattern = " + regExp); - topicPattern = Pattern.compile(regExp); + topicPattern = Pattern.compile(regExp); } - parser.parseDmozFile(new File(dmozFile), subsetDenom, - includeAdult, skew, topicPattern); - + parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew, + topicPattern); + } finally { fs.close(); }
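For context on the two DmozParser filters reformatted above — the topic pattern that main() assembles and logs as "Topic selection pattern = ^(topic1|topic2).*", and the subset-denominator filter that keeps roughly 1/denominator of the URLs via an MD5 hash — the following standalone sketch shows both ideas in isolation. It is illustrative only: DmozFilterSketch is a hypothetical class that is not part of this commit, and because the hunk above ends right after "int hashValue = MD5Hash.digest(url).hashCode();", the modulus acceptance test below is an assumed reconstruction rather than the committed code.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.MD5Hash;

/** Illustrative sketch; not part of this commit. */
public class DmozFilterSketch {

  /**
   * Builds the same alternation pattern that DmozParser.main() logs as
   * "Topic selection pattern = ^(topic1|topic2).*".
   */
  static Pattern topicPattern(List<String> topics) {
    StringBuilder regExp = new StringBuilder("^(");
    for (int j = 0; j < topics.size(); j++) {
      if (j > 0)
        regExp.append("|");
      regExp.append(topics.get(j));
    }
    regExp.append(").*");
    return Pattern.compile(regExp.toString());
  }

  /**
   * Subset-denominator sampling: keep a URL with a chance of roughly
   * 1/subsetDenom. The MD5 hash of the URL matches the hunk above; the
   * acceptance test is an assumption, since that line falls outside the
   * quoted diff context.
   */
  static boolean emit(String url, int subsetDenom, int skew) {
    int hashValue = MD5Hash.digest(url).hashCode();
    // normalize into [0, subsetDenom) so negative hash codes are handled
    int bucket = ((hashValue + skew) % subsetDenom + subsetDenom) % subsetDenom;
    return bucket == 0;
  }

  public static void main(String[] args) {
    Pattern p = topicPattern(Arrays.asList("Top/Computers", "Top/Science"));
    System.out.println(p.matcher("Top/Science/Physics").matches()); // true
    System.out.println(emit("http://example.org/", 10, 0)); // fixed per URL
  }
}

Run with -subset 10 and no -topic arguments, DmozParser would emit about one in ten ExternalPage URLs, which matches the "Only emit with a chance of 1/denominator" comment in the hunk above; the topic pattern, when present, further restricts emission to the listed DMOZ sections.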
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Thu Jan 29 05:38:59 2015 @@ -54,20 +54,26 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * <p>The file dumper tool enables one to reverse generate the raw content - * from Nutch segment data directories. </p> + * <p> + * The file dumper tool enables one to reverse generate the raw content from + * Nutch segment data directories. + * </p> * <p> * The tool has a number of immediate uses: * <ol> * <li>one can see what a page looked like at the time it was crawled</li> * <li>one can see different media types acquired as part of the crawl</li> - * <li>it enables us to see webpages before we augment them with additional metadata, - * this can be handy for providing a provenance trail for your crawl data.</li> + * <li>it enables us to see webpages before we augment them with additional + * metadata, this can be handy for providing a provenance trail for your crawl + * data.</li> * </ol> * </p> - * <p>Upon successful completion the tool displays a very convenient JSON snippet - * detailing the mimetype classifications and the counts of documents which - * fall into those classifications. An example is as follows:</p> + * <p> + * Upon successful completion the tool displays a very convenient JSON snippet + * detailing the mimetype classifications and the counts of documents which fall + * into those classifications. An example is as follows: + * </p> + * * <pre> * {@code * INFO: File Types: @@ -92,45 +98,53 @@ import org.slf4j.LoggerFactory; * } * } * </pre> - * <p>In the case above the tool would have been run with the <b>-mimeType - * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> + * <p> + * In the case above the tool would have been run with the <b>-mimeType + * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> * flag and corresponding values activated. - * + * */ public class FileDumper { private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class .getName()); - /** - * Dumps the reverse engineered raw content from the provided segment directories - * if a parent directory contains more than one segment, otherwise a single segment - * can be passed as an argument. - * @param outputDir the directory you wish to dump the raw content to. This directory will be created. - * @param segmentRootDir a directory containing one or more segments. - * @param mimeTypes an array of mime types we have to dump, all others will be filtered out. + * Dumps the reverse engineered raw content from the provided segment + * directories if a parent directory contains more than one segment, otherwise + * a single segment can be passed as an argument. + * + * @param outputDir + * the directory you wish to dump the raw content to. This directory + * will be created. + * @param segmentRootDir + * a directory containing one or more segments. + * @param mimeTypes + * an array of mime types we have to dump, all others will be + * filtered out. 
* @throws Exception */ - public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) throws Exception { - if (mimeTypes == null) LOG.info("Accepting all mimetypes."); - //total file counts + public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) + throws Exception { + if (mimeTypes == null) + LOG.info("Accepting all mimetypes."); + // total file counts Map<String, Integer> typeCounts = new HashMap<String, Integer>(); - //filtered file counts + // filtered file counts Map<String, Integer> filteredCounts = new HashMap<String, Integer>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); int fileCount = 0; - File[] segmentDirs = segmentRootDir - .listFiles(new FileFilter() { + File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() { - @Override - public boolean accept(File file) { - return file.canRead() && file.isDirectory(); - } - }); + @Override + public boolean accept(File file) { + return file.canRead() && file.isDirectory(); + } + }); if (segmentDirs == null) { - System.err.println("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]"); + System.err.println("No segment directories found in [" + + segmentRootDir.getAbsolutePath() + "]"); return; } @@ -138,18 +152,17 @@ public class FileDumper { LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]"); DataOutputStream doutputStream = null; try { - String segmentPath = segment.getAbsolutePath() - + "/" + Content.DIR_NAME + "/part-00000/data"; + String segmentPath = segment.getAbsolutePath() + "/" + Content.DIR_NAME + + "/part-00000/data"; Path file = new Path(segmentPath); if (!new File(file.toString()).exists()) { LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present"); continue; } - SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, - conf); + SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); - Writable key = (Writable)reader.getKeyClass().newInstance(); + Writable key = (Writable) reader.getKeyClass().newInstance(); Content content = null; while (reader.next(key)) { @@ -158,35 +171,33 @@ public class FileDumper { String url = key.toString(); String baseName = FilenameUtils.getBaseName(url); String extension = FilenameUtils.getExtension(url); - if (extension == null || (extension != null && - extension.equals(""))){ + if (extension == null || (extension != null && extension.equals(""))) { extension = "html"; } String filename = baseName + "." 
+ extension; ByteArrayInputStream bas = null; Boolean filter = false; - try{ + try { bas = new ByteArrayInputStream(content.getContent()); String mimeType = new Tika().detect(content.getContent()); collectStats(typeCounts, mimeType); if (mimeType != null) { - if (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType)) { + if (mimeTypes == null + || Arrays.asList(mimeTypes).contains(mimeType)) { collectStats(filteredCounts, mimeType); filter = true; } } - } - catch(Exception e){ + } catch (Exception e) { e.printStackTrace(); - LOG.warn("Tika is unable to detect type for: ["+url+"]"); - } - finally{ - if(bas != null){ - try{ + LOG.warn("Tika is unable to detect type for: [" + url + "]"); + } finally { + if (bas != null) { + try { bas.close(); + } catch (Exception ignore) { } - catch(Exception ignore){} } } @@ -199,51 +210,58 @@ public class FileDumper { IOUtils.write(content.getContent(), output); fileCount++; } else { - LOG.info("Skipping writing: [" - + outputFullPath + "]: file already exists"); + LOG.info("Skipping writing: [" + outputFullPath + + "]: file already exists"); } } } reader.close(); - } - finally { + } finally { fs.close(); - if (doutputStream != null){ - try{ + if (doutputStream != null) { + try { doutputStream.close(); + } catch (Exception ignore) { } - catch (Exception ignore){} } } } - LOG.info("Dumper File Stats: " + displayFileTypes(typeCounts, filteredCounts)); + LOG.info("Dumper File Stats: " + + displayFileTypes(typeCounts, filteredCounts)); } /** * Main method for invoking this tool - * @param args 1) output directory (which will be created) to host the - * raw data and 2) a directory containing one or more segments. + * + * @param args + * 1) output directory (which will be created) to host the raw data + * and 2) a directory containing one or more segments. * @throws Exception */ public static void main(String[] args) throws Exception { - //boolean options + // boolean options Option helpOpt = new Option("h", "help", false, "show this help message"); - //argument options + // argument options @SuppressWarnings("static-access") - Option outputOpt = OptionBuilder.withArgName("outputDir") - .hasArg().withDescription("output directory (which will be created) to host the raw data") - .create("outputDir"); + Option outputOpt = OptionBuilder + .withArgName("outputDir") + .hasArg() + .withDescription( + "output directory (which will be created) to host the raw data") + .create("outputDir"); @SuppressWarnings("static-access") - Option segOpt = OptionBuilder.withArgName("segment") - .hasArgs().withDescription("the segment(s) to use") - .create("segment"); + Option segOpt = OptionBuilder.withArgName("segment").hasArgs() + .withDescription("the segment(s) to use").create("segment"); @SuppressWarnings("static-access") - Option mimeOpt = OptionBuilder.withArgName("mimetype") - .hasArgs().withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.") - .create("mimetype"); + Option mimeOpt = OptionBuilder + .withArgName("mimetype") + .hasArgs() + .withDescription( + "an optional list of mimetypes to dump, excluding all others. 
Defaults to all.") + .create("mimetype"); - //create the options + // create the options Options options = new Options(); options.addOption(helpOpt); options.addOption(outputOpt); @@ -267,13 +285,14 @@ public class FileDumper { if (!outputDir.exists()) { LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it."); - if(!outputDir.mkdirs()) throw new Exception("Unable to create: ["+outputDir.getAbsolutePath()+"]"); + if (!outputDir.mkdirs()) + throw new Exception("Unable to create: [" + + outputDir.getAbsolutePath() + "]"); } FileDumper dumper = new FileDumper(); dumper.dump(outputDir, segmentRootDir, mimeTypes); - } - catch(Exception e) { + } catch (Exception e) { LOG.error("FileDumper: " + StringUtils.stringifyException(e)); e.printStackTrace(); return; @@ -282,13 +301,13 @@ public class FileDumper { private void collectStats(Map<String, Integer> typeCounts, String mimeType) { typeCounts.put(mimeType, - typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 - : 1); + typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1); } - private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) { - StringBuilder builder = new StringBuilder(); - //print total stats + private String displayFileTypes(Map<String, Integer> typeCounts, + Map<String, Integer> filteredCounts) { + StringBuilder builder = new StringBuilder(); + // print total stats builder.append("\n TOTAL Stats:\n"); builder.append(" {\n"); for (String mimeType : typeCounts.keySet()) { Modified: nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Jan 29 05:38:59 2015 @@ -54,19 +54,20 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** - * This tool generates fetchlists (segments to be fetched) from plain text - * files containing one URL per line. It's useful when arbitrary URL-s need to - * be fetched without adding them first to the CrawlDb, or during testing. + * This tool generates fetchlists (segments to be fetched) from plain text files + * containing one URL per line. It's useful when arbitrary URL-s need to be + * fetched without adding them first to the CrawlDb, or during testing. 
*/ public class FreeGenerator extends Configured implements Tool { - private static final Logger LOG = LoggerFactory.getLogger(FreeGenerator.class); - + private static final Logger LOG = LoggerFactory + .getLogger(FreeGenerator.class); + private static final String FILTER_KEY = "free.generator.filter"; private static final String NORMALIZE_KEY = "free.generator.normalize"; - public static class FG extends MapReduceBase - implements Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>, - Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> { + public static class FG extends MapReduceBase implements + Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>, + Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> { private URLNormalizers normalizers = null; private URLFilters filters = null; private ScoringFilters scfilters; @@ -89,13 +90,15 @@ public class FreeGenerator extends Confi Generator.SelectorEntry entry = new Generator.SelectorEntry(); - public void map(WritableComparable<?> key, Text value, OutputCollector<Text, - Generator.SelectorEntry> output, Reporter reporter) throws IOException { + public void map(WritableComparable<?> key, Text value, + OutputCollector<Text, Generator.SelectorEntry> output, Reporter reporter) + throws IOException { // value is a line of text String urlString = value.toString(); try { if (normalizers != null) { - urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_INJECT); + urlString = normalizers.normalize(urlString, + URLNormalizers.SCOPE_INJECT); } if (urlString != null && filters != null) { urlString = filters.filter(urlString); @@ -105,7 +108,8 @@ public class FreeGenerator extends Confi scfilters.injectedScore(url, datum); } } catch (Exception e) { - LOG.warn("Error adding url '" + value.toString() + "', skipping: " + StringUtils.stringifyException(e)); + LOG.warn("Error adding url '" + value.toString() + "', skipping: " + + StringUtils.stringifyException(e)); return; } if (urlString == null) { @@ -122,8 +126,10 @@ public class FreeGenerator extends Confi } public void reduce(Text key, Iterator<Generator.SelectorEntry> values, - OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { - // pick unique urls from values - discard the reduce key due to hash collisions + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { + // pick unique urls from values - discard the reduce key due to hash + // collisions HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>(); while (values.hasNext()) { Generator.SelectorEntry entry = values.next(); @@ -138,12 +144,17 @@ public class FreeGenerator extends Confi public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]"); - System.err.println("\tinputDir\tinput directory containing one or more input files."); - System.err.println("\t\tEach text file contains a list of URLs, one URL per line"); - System.err.println("\tsegmentsDir\toutput directory, where new segment will be created"); + System.err + .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]"); + System.err + .println("\tinputDir\tinput directory containing one or more input files."); + System.err + .println("\t\tEach text file contains a list of URLs, one URL per line"); + System.err + .println("\tsegmentsDir\toutput directory, where new segment will be created"); System.err.println("\t-filter\trun current URLFilters on 
input URLs"); - System.err.println("\t-normalize\trun current URLNormalizers on input URLs"); + System.err + .println("\t-normalize\trun current URLNormalizers on input URLs"); return -1; } boolean filter = false; @@ -181,8 +192,8 @@ public class FreeGenerator extends Confi job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(Generator.HashComparator.class); - FileOutputFormat.setOutputPath(job, new Path(args[1], - new Path(segName, CrawlDatum.GENERATE_DIR_NAME))); + FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, + CrawlDatum.GENERATE_DIR_NAME))); try { JobClient.runJob(job); } catch (Exception e) { @@ -190,12 +201,14 @@ public class FreeGenerator extends Confi return -1; } long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); return 0; } public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args); + int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), + args); System.exit(res); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Thu Jan 29 05:38:59 2015 @@ -59,8 +59,7 @@ public class ResolveUrls { /** * A Thread which gets the ip address of a single host by name. */ - private static class ResolverThread - extends Thread { + private static class ResolverThread extends Thread { private String url = null; @@ -74,14 +73,13 @@ public class ResolveUrls { String host = URLUtil.getHost(url); long start = System.currentTimeMillis(); try { - - // get the address by name and if no error is thrown then it + + // get the address by name and if no error is thrown then it // is resolved successfully InetAddress.getByName(host); LOG.info("Resolved: " + host); numResolved.incrementAndGet(); - } - catch (Exception uhe) { + } catch (Exception uhe) { LOG.info("Error Resolving: " + host); numErrored.incrementAndGet(); } @@ -93,8 +91,8 @@ public class ResolveUrls { } /** - * Creates a thread pool for resolving urls. Reads in the url file on the - * local filesystem. For each url it attempts to resolve it keeping a total + * Creates a thread pool for resolving urls. Reads in the url file on the + * local filesystem. For each url it attempts to resolve it keeping a total * account of the number resolved, errored, and the amount of time. 
*/ public void resolveUrls() { @@ -103,13 +101,13 @@ public class ResolveUrls { // create a thread pool with a fixed number of threads pool = Executors.newFixedThreadPool(numThreads); - + // read in the urls file and loop through each line, one url per line BufferedReader buffRead = new BufferedReader(new FileReader(new File( - urlsFile))); + urlsFile))); String urlStr = null; while ((urlStr = buffRead.readLine()) != null) { - + // spin up a resolver thread per url LOG.info("Starting: " + urlStr); pool.execute(new ResolverThread(urlStr)); @@ -119,9 +117,8 @@ public class ResolveUrls { // the thread pool to give urls time to finish resolving buffRead.close(); pool.awaitTermination(60, TimeUnit.SECONDS); - } - catch (Exception e) { - + } catch (Exception e) { + // on error shutdown the thread pool immediately pool.shutdownNow(); LOG.info(StringUtils.stringifyException(e)); @@ -129,15 +126,16 @@ public class ResolveUrls { // shutdown the thread pool and log totals pool.shutdown(); - LOG.info("Total: " + numTotal.get() + ", Resovled: " - + numResolved.get() + ", Errored: " + numErrored.get() - + ", Average Time: " + totalTime.get() / numTotal.get()); + LOG.info("Total: " + numTotal.get() + ", Resovled: " + numResolved.get() + + ", Errored: " + numErrored.get() + ", Average Time: " + + totalTime.get() / numTotal.get()); } /** * Create a new ResolveUrls with a file from the local file system. - * - * @param urlsFile The local urls file, one url per line. + * + * @param urlsFile + * The local urls file, one url per line. */ public ResolveUrls(String urlsFile) { this(urlsFile, 100); @@ -145,10 +143,12 @@ public class ResolveUrls { /** * Create a new ResolveUrls with a urls file and a number of threads for the - * Thread pool. Number of threads is 100 by default. + * Thread pool. Number of threads is 100 by default. * - * @param urlsFile The local urls file, one url per line. - * @param numThreads The number of threads used to resolve urls in parallel. + * @param urlsFile + * The local urls file, one url per line. + * @param numThreads + * The number of threads used to resolve urls in parallel. */ public ResolveUrls(String urlsFile, int numThreads) { this.urlsFile = urlsFile; @@ -165,13 +165,13 @@ public class ResolveUrls { OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("urls"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the urls file to check"); Option urlOpts = OptionBuilder.create("urls"); options.addOption(urlOpts); - + OptionBuilder.withArgName("numThreads"); OptionBuilder.hasArgs(); OptionBuilder.withDescription("the number of threads to use"); @@ -197,8 +197,7 @@ public class ResolveUrls { } ResolveUrls resolve = new ResolveUrls(urls, numThreads); resolve.resolveUrls(); - } - catch (Exception e) { + } catch (Exception e) { LOG.error("ResolveUrls: " + StringUtils.stringifyException(e)); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java Thu Jan 29 05:38:59 2015 @@ -30,21 +30,22 @@ import org.apache.hadoop.mapred.Reporter /** * A input format the reads arc files. 
*/ -public class ArcInputFormat - extends FileInputFormat<Text, BytesWritable> { +public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> { /** * Returns the <code>RecordReader</code> for reading the arc file. * - * @param split The InputSplit of the arc file to process. - * @param job The job configuration. - * @param reporter The progress reporter. + * @param split + * The InputSplit of the arc file to process. + * @param job + * The job configuration. + * @param reporter + * The progress reporter. */ public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split, - JobConf job, Reporter reporter) - throws IOException { + JobConf job, Reporter reporter) throws IOException { reporter.setStatus(split.toString()); - return new ArcRecordReader(job, (FileSplit)split); + return new ArcRecordReader(job, (FileSplit) split); } } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Thu Jan 29 05:38:59 2015 @@ -34,23 +34,28 @@ import org.apache.hadoop.util.Reflection import org.apache.hadoop.util.StringUtils; /** - * <p>The <code>ArchRecordReader</code> class provides a record reader which - * reads records from arc files.</p> + * <p> + * The <code>ArchRecordReader</code> class provides a record reader which reads + * records from arc files. + * </p> * - * <p>Arc files are essentially tars of gzips. Each record in an arc file is - * a compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see - * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } .</p> + * <p> + * Arc files are essentially tars of gzips. Each record in an arc file is a + * compressed gzip. Multiple records are concatenated together to form a + * complete arc. For more information on the arc file format see {@link http + * ://www.archive.org/web/researcher/ArcFileFormat.php } . + * </p> * - * <p>Arc files are used by the internet archive and grub projects.</p> + * <p> + * Arc files are used by the internet archive and grub projects. + * </p> * - * see {@link http://www.archive.org/ } - * see {@link http://www.grub.org/ } + * see {@link http://www.archive.org/ } see {@link http://www.grub.org/ } */ -public class ArcRecordReader - implements RecordReader<Text, BytesWritable> { +public class ArcRecordReader implements RecordReader<Text, BytesWritable> { - public static final Logger LOG = LoggerFactory.getLogger(ArcRecordReader.class); + public static final Logger LOG = LoggerFactory + .getLogger(ArcRecordReader.class); protected Configuration conf; protected long splitStart = 0; @@ -60,30 +65,32 @@ public class ArcRecordReader protected long fileLen = 0; protected FSDataInputStream in; - private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B}; + private static byte[] MAGIC = { (byte) 0x1F, (byte) 0x8B }; /** - * <p>Returns true if the byte array passed matches the gzip header magic - * number.</p> + * <p> + * Returns true if the byte array passed matches the gzip header magic number. + * </p> * - * @param input The byte array to check. + * @param input + * The byte array to check. 
* * @return True if the byte array matches the gzip header magic number. */ public static boolean isMagic(byte[] input) { - // check for null and incorrect length + // check for null and incorrect length if (input == null || input.length != MAGIC.length) { return false; } - + // check byte by byte for (int i = 0; i < MAGIC.length; i++) { if (MAGIC[i] != input[i]) { return false; } } - + // must match return true; } @@ -91,13 +98,16 @@ public class ArcRecordReader /** * Constructor that sets the configuration and file split. * - * @param conf The job configuration. - * @param split The file split to read from. + * @param conf + * The job configuration. + * @param split + * The file split to read from. * - * @throws IOException If an IO error occurs while initializing file split. + * @throws IOException + * If an IO error occurs while initializing file split. */ public ArcRecordReader(Configuration conf, FileSplit split) - throws IOException { + throws IOException { Path path = split.getPath(); FileSystem fs = path.getFileSystem(conf); @@ -113,8 +123,7 @@ public class ArcRecordReader /** * Closes the record reader resources. */ - public void close() - throws IOException { + public void close() throws IOException { this.in.close(); } @@ -137,63 +146,64 @@ public class ArcRecordReader * * @return The long of the current position in the file. */ - public long getPos() - throws IOException { + public long getPos() throws IOException { return in.getPos(); } /** - * Returns the percentage of progress in processing the file. This will be + * Returns the percentage of progress in processing the file. This will be * represented as a float from 0 to 1 with 1 being 100% completed. * * @return The percentage of progress as a float from 0 to 1. */ - public float getProgress() - throws IOException { - + public float getProgress() throws IOException { + // if we haven't even started if (splitEnd == splitStart) { return 0.0f; - } - else { - // the progress is current pos - where we started / length of the split - return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen); + } else { + // the progress is current pos - where we started / length of the split + return Math.min(1.0f, (getPos() - splitStart) / (float) splitLen); } } /** - * <p>Returns true if the next record in the split is read into the key and - * value pair. The key will be the arc record header and the values will be - * the raw content bytes of the arc record.</p> + * <p> + * Returns true if the next record in the split is read into the key and value + * pair. The key will be the arc record header and the values will be the raw + * content bytes of the arc record. + * </p> * - * @param key The record key - * @param value The record value + * @param key + * The record key + * @param value + * The record value * * @return True if the next record is read. * - * @throws IOException If an error occurs while reading the record value. + * @throws IOException + * If an error occurs while reading the record value. 
*/ - public boolean next(Text key, BytesWritable value) - throws IOException { + public boolean next(Text key, BytesWritable value) throws IOException { try { - + // get the starting position on the input stream long startRead = in.getPos(); byte[] magicBuffer = null; - + // we need this loop to handle false positives in reading of gzip records while (true) { - + // while we haven't passed the end of the split if (startRead >= splitEnd) { return false; } - + // scanning for the gzip header boolean foundStart = false; while (!foundStart) { - + // start at the current file position and scan for 1K at time, break // if there is no more to read startRead = in.getPos(); @@ -202,13 +212,13 @@ public class ArcRecordReader if (read < 0) { break; } - - // scan the byte array for the gzip header magic number. This happens + + // scan the byte array for the gzip header magic number. This happens // byte by byte for (int i = 0; i < read - 1; i++) { byte[] testMagic = new byte[2]; - System.arraycopy(magicBuffer, i, testMagic, 0, 2); - if (isMagic(testMagic)) { + System.arraycopy(magicBuffer, i, testMagic, 0, 2); + if (isMagic(testMagic)) { // set the next start to the current gzip header startRead += i; foundStart = true; @@ -216,14 +226,14 @@ public class ArcRecordReader } } } - + // seek to the start of the gzip header in.seek(startRead); ByteArrayOutputStream baos = null; int totalRead = 0; try { - + // read 4K of the gzip at a time putting into a byte array byte[] buffer = new byte[4096]; GZIPInputStream zin = new GZIPInputStream(in); @@ -233,9 +243,8 @@ public class ArcRecordReader baos.write(buffer, 0, gzipRead); totalRead += gzipRead; } - } - catch (Exception e) { - + } catch (Exception e) { + // there are times we get false positives where the gzip header exists // but it is not an actual gzip record, so we ignore it and start // over seeking @@ -248,7 +257,7 @@ public class ArcRecordReader // change the output stream to a byte array byte[] content = baos.toByteArray(); - + // the first line of the raw content in arc files is the header int eol = 0; for (int i = 0; i < content.length; i++) { @@ -257,34 +266,33 @@ public class ArcRecordReader break; } } - + // create the header and the raw content minus the header String header = new String(content, 0, eol).trim(); byte[] raw = new byte[(content.length - eol) - 1]; System.arraycopy(content, eol + 1, raw, 0, raw.length); - + // populate key and values with the header and raw content. Text keyText = key; keyText.set(header); BytesWritable valueBytes = value; valueBytes.set(raw, 0, raw.length); - // TODO: It would be best to start at the end of the gzip read but - // the bytes read in gzip don't match raw bytes in the file so we - // overshoot the next header. With this current method you get + // TODO: It would be best to start at the end of the gzip read but + // the bytes read in gzip don't match raw bytes in the file so we + // overshoot the next header. With this current method you get // some false positives but don't miss records. 
if (startRead + 1 < fileLen) { in.seek(startRead + 1); } - + // populated the record, now return return true; } + } catch (Exception e) { + LOG.equals(StringUtils.stringifyException(e)); } - catch (Exception e) { - LOG.equals(StringUtils.stringifyException(e)); - } - + // couldn't populate the record or there is no next record to read return false; } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Thu Jan 29 05:38:59 2015 @@ -61,18 +61,22 @@ import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.TimingUtil; /** - * <p>The <code>ArcSegmentCreator</code> is a replacement for fetcher that will - * take arc files as input and produce a nutch segment as output.</p> + * <p> + * The <code>ArcSegmentCreator</code> is a replacement for fetcher that will + * take arc files as input and produce a nutch segment as output. + * </p> * - * <p>Arc files are tars of compressed gzips which are produced by both the - * internet archive project and the grub distributed crawler project.</p> + * <p> + * Arc files are tars of compressed gzips which are produced by both the + * internet archive project and the grub distributed crawler project. + * </p> * */ -public class ArcSegmentCreator - extends Configured - implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> { +public class ArcSegmentCreator extends Configured implements Tool, + Mapper<Text, BytesWritable, Text, NutchWritable> { - public static final Logger LOG = LoggerFactory.getLogger(ArcSegmentCreator.class); + public static final Logger LOG = LoggerFactory + .getLogger(ArcSegmentCreator.class); public static final String URL_VERSION = "arc.url.version"; private JobConf jobConf; private URLFilters urlFilters; @@ -88,7 +92,9 @@ public class ArcSegmentCreator } /** - * <p>Constructor that sets the job configuration.</p> + * <p> + * Constructor that sets the job configuration. + * </p> * * @param conf */ @@ -104,17 +110,19 @@ public class ArcSegmentCreator public static synchronized String generateSegmentName() { try { Thread.sleep(1000); - } - catch (Throwable t) { + } catch (Throwable t) { } return sdf.format(new Date(System.currentTimeMillis())); } /** - * <p>Configures the job. Sets the url filters, scoring filters, url normalizers - * and other relevant data.</p> + * <p> + * Configures the job. Sets the url filters, scoring filters, url normalizers + * and other relevant data. + * </p> * - * @param job The job configuration. + * @param job + * The job configuration. */ public void configure(JobConf job) { @@ -132,23 +140,31 @@ public class ArcSegmentCreator } /** - * <p>Parses the raw content of a single record to create output. This method - * is almost the same as the {@link org.apache.nutch.Fetcher#output} method in - * terms of processing and output. + * <p> + * Parses the raw content of a single record to create output. This method is + * almost the same as the {@link org.apache.nutch.Fetcher#output} method in + * terms of processing and output. * - * @param output The job output collector. - * @param segmentName The name of the segment to create. - * @param key The url of the record. 
- * @param datum The CrawlDatum of the record. - * @param content The raw content of the record - * @param pstatus The protocol status - * @param status The fetch status. + * @param output + * The job output collector. + * @param segmentName + * The name of the segment to create. + * @param key + * The url of the record. + * @param datum + * The CrawlDatum of the record. + * @param content + * The raw content of the record + * @param pstatus + * The protocol status + * @param status + * The fetch status. * * @return The result of the parse in a ParseStatus object. */ - private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName, - Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, - int status) { + private ParseStatus output(OutputCollector<Text, NutchWritable> output, + String segmentName, Text key, CrawlDatum datum, Content content, + ProtocolStatus pstatus, int status) { // set the fetch status and the fetch time datum.setStatus(status); @@ -164,8 +180,7 @@ public class ArcSegmentCreator // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } @@ -175,16 +190,15 @@ public class ArcSegmentCreator // parse the content parseResult = this.parseUtil.parse(content); - } - catch (Exception e) { + } catch (Exception e) { LOG.warn("Error parsing: " + key + ": " - + StringUtils.stringifyException(e)); + + StringUtils.stringifyException(e)); } // set the content signature if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(getConf()).calculate( - content, new ParseStatus().getEmptyParse(getConf())); + content, new ParseStatus().getEmptyParse(getConf())); datum.setSignature(signature); } @@ -193,7 +207,7 @@ public class ArcSegmentCreator output.collect(key, new NutchWritable(content)); if (parseResult != null) { - for (Entry <Text, Parse> entry : parseResult) { + for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); @@ -203,35 +217,34 @@ public class ArcSegmentCreator parse = parseStatus.getEmptyParse(getConf()); } - // Calculate page signature. - byte[] signature = SignatureFactory.getSignature(getConf()).calculate( - content, parse); + // Calculate page signature. 
+ byte[] signature = SignatureFactory.getSignature(getConf()) + .calculate(content, parse); // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); + parse.getData().getContentMeta() + .set(Nutch.SEGMENT_NAME_KEY, segmentName); + parse.getData().getContentMeta() + .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); + parse.getData().getContentMeta() + .set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime())); if (url.equals(key)) datum.setSignature(signature); try { scfilters.passScoreAfterParsing(url, content, parse); - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } output.collect(url, new NutchWritable(new ParseImpl(new ParseText( - parse.getText()), parse.getData(), parse.isCanonical()))); + parse.getText()), parse.getData(), parse.isCanonical()))); } } - } - catch (IOException e) { + } catch (IOException e) { if (LOG.isErrorEnabled()) { - LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e)); + LOG.error("ArcSegmentCreator caught:" + + StringUtils.stringifyException(e)); } } @@ -243,42 +256,51 @@ public class ArcSegmentCreator } } } - + return null; } /** - * <p>Logs any error that occurs during conversion.</p> + * <p> + * Logs any error that occurs during conversion. + * </p> * - * @param url The url we are parsing. - * @param t The error that occured. + * @param url + * The url we are parsing. + * @param t + * The error that occured. */ private void logError(Text url, Throwable t) { if (LOG.isInfoEnabled()) { - LOG.info("Conversion of " + url + " failed with: " + - StringUtils.stringifyException(t)); + LOG.info("Conversion of " + url + " failed with: " + + StringUtils.stringifyException(t)); } } /** - * <p>Runs the Map job to translate an arc record into output for Nutch - * segments.</p> + * <p> + * Runs the Map job to translate an arc record into output for Nutch segments. + * </p> * - * @param key The arc record header. - * @param bytes The arc record raw content bytes. - * @param output The output collecter. - * @param reporter The progress reporter. + * @param key + * The arc record header. + * @param bytes + * The arc record raw content bytes. + * @param output + * The output collecter. + * @param reporter + * The progress reporter. */ public void map(Text key, BytesWritable bytes, - OutputCollector<Text, NutchWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, NutchWritable> output, Reporter reporter) + throws IOException { String[] headers = key.toString().split("\\s+"); String urlStr = headers[0]; String version = headers[2]; String contentType = headers[3]; - - // arcs start with a file description. for now we ignore this as it is not + + // arcs start with a file description. 
for now we ignore this as it is not // a content record if (urlStr.startsWith("filedesc://")) { LOG.info("Ignoring file header: " + urlStr); @@ -286,18 +308,17 @@ public class ArcSegmentCreator } LOG.info("Processing: " + urlStr); - // get the raw bytes from the arc file, create a new crawldatum + // get the raw bytes from the arc file, create a new crawldatum Text url = new Text(); CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, - 1.0f); + 1.0f); String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY); // normalize and filter the urls try { urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER); urlStr = urlFilters.filter(urlStr); // filter the url - } - catch (Exception e) { + } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } @@ -312,37 +333,41 @@ public class ArcSegmentCreator // set the protocol status to success and the crawl status to success // create the content from the normalized url and the raw bytes from - // the arc file, TODO: currently this doesn't handle text of errors + // the arc file, TODO: currently this doesn't handle text of errors // pages (i.e. 404, etc.). We assume we won't get those. ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS; - Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType, - new Metadata(), getConf()); - + Content content = new Content(urlStr, urlStr, bytes.getBytes(), + contentType, new Metadata(), getConf()); + // set the url version into the metadata content.getMetadata().set(URL_VERSION, version); ParseStatus pstatus = null; pstatus = output(output, segmentName, url, datum, content, status, - CrawlDatum.STATUS_FETCH_SUCCESS); + CrawlDatum.STATUS_FETCH_SUCCESS); reporter.progress(); - } - catch (Throwable t) { // unexpected exception + } catch (Throwable t) { // unexpected exception logError(url, t); output(output, segmentName, url, datum, null, null, - CrawlDatum.STATUS_FETCH_RETRY); + CrawlDatum.STATUS_FETCH_RETRY); } } } /** - * <p>Creates the arc files to segments job.</p> + * <p> + * Creates the arc files to segments job. + * </p> * - * @param arcFiles The path to the directory holding the arc files - * @param segmentsOutDir The output directory for writing the segments + * @param arcFiles + * The path to the directory holding the arc files + * @param segmentsOutDir + * The output directory for writing the segments * - * @throws IOException If an IO error occurs while running the job. + * @throws IOException + * If an IO error occurs while running the job. 
*/ public void createSegments(Path arcFiles, Path segmentsOutDir) - throws IOException { + throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -366,17 +391,17 @@ public class ArcSegmentCreator JobClient.runJob(job); long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } - public static void main(String args[]) - throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args); + public static void main(String args[]) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), + new ArcSegmentCreator(), args); System.exit(res); } - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>"; @@ -393,8 +418,7 @@ public class ArcSegmentCreator // create the segments from the arc files createSegments(arcFiles, segmentsOutDir); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e)); return -1; } Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>. */ package org.apache.nutch.tools.arc; + Modified: nutch/trunk/src/java/org/apache/nutch/tools/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Miscellaneous tools. */ package org.apache.nutch.tools; + Modified: nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Thu Jan 29 05:38:59 2015 @@ -82,11 +82,11 @@ public class CommandRunner { } public void evaluate() throws IOException { - this.exec(); + this.exec(); } /** - * + * * @return process exit value (return code) or -1 if timed out. * @throws IOException */ @@ -94,13 +94,11 @@ public class CommandRunner { Process proc = Runtime.getRuntime().exec(_command); _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.
  */
 package org.apache.nutch.tools.arc;
+

Modified: nutch/trunk/src/java/org/apache/nutch/tools/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Miscellaneous tools.
  */
 package org.apache.nutch.tools;
+

Modified: nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Thu Jan 29 05:38:59 2015
@@ -82,11 +82,11 @@ public class CommandRunner {
   }
 
   public void evaluate() throws IOException {
-    this.exec();
+    this.exec();
   }
 
   /**
-   * 
+   * 
    * @return process exit value (return code) or -1 if timed out.
   * @throws IOException
   */
@@ -94,13 +94,11 @@ public class CommandRunner {
     Process proc = Runtime.getRuntime().exec(_command);
     _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));
 
-    PullerThread so =
-      new PullerThread("STDOUT", proc.getInputStream(), _stdout);
+    PullerThread so = new PullerThread("STDOUT", proc.getInputStream(), _stdout);
     so.setDaemon(true);
     so.start();
 
-    PullerThread se =
-      new PullerThread("STDERR", proc.getErrorStream(), _stderr);
+    PullerThread se = new PullerThread("STDERR", proc.getErrorStream(), _stderr);
     se.setDaemon(true);
     se.start();
@@ -145,11 +143,11 @@ public class CommandRunner {
         Thread.sleep(1000);
         _xit = proc.exitValue();
       } catch (InterruptedException ie) {
-      if (Thread.interrupted()) {
-        break; // stop waiting on an interrupt for this thread
-      } else {
-        continue;
-      }
+        if (Thread.interrupted()) {
+          break; // stop waiting on an interrupt for this thread
+        } else {
+          continue;
+        }
       } catch (IllegalThreadStateException iltse) {
         continue;
       }
@@ -181,11 +179,8 @@ public class CommandRunner {
 
     private boolean _closeInput;
 
-    protected PumperThread(
-      String name,
-      InputStream is,
-      OutputStream os,
-      boolean closeInput) {
+    protected PumperThread(String name, InputStream is, OutputStream os,
+        boolean closeInput) {
       super(name);
       _is = is;
       _os = os;
@@ -218,12 +213,12 @@ public class CommandRunner {
         }
       }
       try {
-      _barrier.await();
-      } catch (InterruptedException ie) {
-      /* IGNORE */
-      } catch (BrokenBarrierException bbe) {
-      /* IGNORE */
-      }
+        _barrier.await();
+      } catch (InterruptedException ie) {
+        /* IGNORE */
+      } catch (BrokenBarrierException bbe) {
+        /* IGNORE */
+      }
     }
   }
 
@@ -269,8 +264,9 @@ public class CommandRunner {
 
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-timeout")) {
-        timeout = Integer.parseInt(args[++i]);;
-      } else if (i != args.length-2) {
+        timeout = Integer.parseInt(args[++i]);
+        ;
+      } else if (i != args.length - 2) {
         System.err.println(usage);
         System.exit(-1);
       } else {
@@ -290,6 +286,6 @@ public class CommandRunner {
 
     cr.evaluate();
 
-    System.err.println("output value: "+cr.getExitValue());
+    System.err.println("output value: " + cr.getExitValue());
   }
 }
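CommandRunner, reformatted above, executes a child process, drains STDOUT/STDERR on daemon threads synchronized by a CyclicBarrier, and reports -1 on timeout. A hedged usage sketch; the setter names (setCommand, setTimeout, setStdOutputStream, setStdErrorStream) are inferred from the fields shown in this diff, so verify them against the class before relying on this:

  import java.io.ByteArrayOutputStream;
  import org.apache.nutch.util.CommandRunner;

  public class CommandRunnerSketch {
    public static void main(String[] args) throws Exception {
      CommandRunner cr = new CommandRunner();
      cr.setCommand("ls -l");   // hypothetical external command
      cr.setTimeout(10);        // seconds; exit value is -1 on timeout

      ByteArrayOutputStream out = new ByteArrayOutputStream();
      ByteArrayOutputStream err = new ByteArrayOutputStream();
      cr.setStdOutputStream(out);
      cr.setStdErrorStream(err);

      cr.evaluate();            // delegates to exec() per the diff above
      System.err.println("output value: " + cr.getExitValue());
      System.out.println(out.toString());
    }
  }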
Modified: nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Thu Jan 29 05:38:59 2015
@@ -28,19 +28,18 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * A collection of utility methods for working on deflated data.
+ * A collection of utility methods for working on deflated data.
 */
 public class DeflateUtils {
-  
+
   private static final Logger LOG = LoggerFactory.getLogger(DeflateUtils.class);
   private static final int EXPECTED_COMPRESSION_RATIO = 5;
   private static final int BUF_SIZE = 4096;
 
   /**
-   * Returns an inflated copy of the input array. If the deflated
-   * input has been truncated or corrupted, a best-effort attempt is
-   * made to inflate as much as possible. If no data can be extracted
-   * <code>null</code> is returned.
+   * Returns an inflated copy of the input array. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
   */
  public static final byte[] inflateBestEffort(byte[] in) {
    return inflateBestEffort(in, Integer.MAX_VALUE);
@@ -48,37 +47,36 @@ public class DeflateUtils {
 
   /**
    * Returns an inflated copy of the input array, truncated to
-   * <code>sizeLimit</code> bytes, if necessary. If the deflated input
-   * has been truncated or corrupted, a best-effort attempt is made to
-   * inflate as much as possible. If no data can be extracted
-   * <code>null</code> is returned.
+   * <code>sizeLimit</code> bytes, if necessary. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
    */
   public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
-    // decompress using InflaterInputStream
-    ByteArrayOutputStream outStream =
-      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
 
     // "true" because HTTP does not provide zlib headers
     Inflater inflater = new Inflater(true);
-    InflaterInputStream inStream =
-      new InflaterInputStream(new ByteArrayInputStream(in), inflater);
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in), inflater);
 
     byte[] buf = new byte[BUF_SIZE];
     int written = 0;
     while (true) {
       try {
-      int size = inStream.read(buf);
-      if (size <= 0)
-        break;
-      if ((written + size) > sizeLimit) {
-        outStream.write(buf, 0, sizeLimit - written);
-        break;
-      }
-      outStream.write(buf, 0, size);
-      written+= size;
+        int size = inStream.read(buf);
+        if (size <= 0)
+          break;
+        if ((written + size) > sizeLimit) {
+          outStream.write(buf, 0, sizeLimit - written);
+          break;
+        }
+        outStream.write(buf, 0, size);
+        written += size;
       } catch (Exception e) {
-        LOG.info( "Caught Exception in inflateBestEffort", e );
-        break;
+        LOG.info("Caught Exception in inflateBestEffort", e);
+        break;
       }
     }
     try {
@@ -89,23 +87,24 @@ public class DeflateUtils {
     return outStream.toByteArray();
   }
 
-
   /**
-   * Returns an inflated copy of the input array.
-   * @throws IOException if the input cannot be properly decompressed
+   * Returns an inflated copy of the input array.
+   * 
+   * @throws IOException
+   *           if the input cannot be properly decompressed
   */
  public static final byte[] inflate(byte[] in) throws IOException {
-    // decompress using InflaterInputStream
-    ByteArrayOutputStream outStream =
-      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
 
-    InflaterInputStream inStream =
-      new InflaterInputStream ( new ByteArrayInputStream(in) );
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in));
 
     byte[] buf = new byte[BUF_SIZE];
     while (true) {
       int size = inStream.read(buf);
-      if (size <= 0)
+      if (size <= 0)
         break;
       outStream.write(buf, 0, size);
     }
@@ -118,9 +117,9 @@ public class DeflateUtils {
    * Returns a deflated copy of the input array.
   */
  public static final byte[] deflate(byte[] in) {
-    // compress using DeflaterOutputStream
-    ByteArrayOutputStream byteOut =
-      new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+    // compress using DeflaterOutputStream
+    ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+        / EXPECTED_COMPRESSION_RATIO);
 
     DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
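One detail worth noting while reading this diff: deflate() writes zlib-wrapped output (a plain DeflaterOutputStream), which pairs with inflate(); inflateBestEffort() instead constructs Inflater(true) because HTTP servers send raw deflate data without zlib headers, so the two are not interchangeable. A minimal round-trip sketch using the zlib-wrapped pair:

  import java.nio.charset.StandardCharsets;
  import org.apache.nutch.util.DeflateUtils;

  public class DeflateRoundTrip {
    public static void main(String[] args) throws Exception {
      byte[] original = "hello, nutch".getBytes(StandardCharsets.UTF_8);
      // deflate() emits zlib-wrapped data, so pair it with inflate(),
      // not with inflateBestEffort(), which expects raw HTTP deflate
      byte[] compressed = DeflateUtils.deflate(original);
      byte[] restored = DeflateUtils.inflate(compressed);
      System.out.println(new String(restored, StandardCharsets.UTF_8));
    }
  }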
Modified: nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java Thu Jan 29 05:38:59 2015
@@ -38,7 +38,6 @@ import org.xml.sax.SAXException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-
 public class DomUtil {
 
   private final static Logger LOG = LoggerFactory.getLogger(DomUtil.class);
@@ -61,10 +60,10 @@ public class DomUtil {
       input.setEncoding("UTF-8");
       parser.parse(input);
       int i = 0;
-      while (! (parser.getDocument().getChildNodes().item(i) instanceof Element)) {
-      i++;
-      }
-      element = (Element)parser.getDocument().getChildNodes().item(i);
+      while (!(parser.getDocument().getChildNodes().item(i) instanceof Element)) {
+        i++;
+      }
+      element = (Element) parser.getDocument().getChildNodes().item(i);
     } catch (FileNotFoundException e) {
       LOG.error("Error: ", e);
     } catch (SAXException e) {
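The loop cleaned up above skips leading non-Element nodes (comments, processing instructions) to reach the document's first real element. For illustration, the same pattern using the JDK's DocumentBuilder rather than the Xerces DOMParser used here:

  import java.io.ByteArrayInputStream;
  import javax.xml.parsers.DocumentBuilderFactory;
  import org.w3c.dom.Document;
  import org.w3c.dom.Element;

  public class FirstElementSketch {
    public static void main(String[] args) throws Exception {
      byte[] xml = "<!-- note --><root><child/></root>".getBytes("UTF-8");
      Document doc = DocumentBuilderFactory.newInstance()
          .newDocumentBuilder().parse(new ByteArrayInputStream(xml));
      // skip the leading comment node to reach the first Element
      int i = 0;
      while (!(doc.getChildNodes().item(i) instanceof Element)) {
        i++;
      }
      Element element = (Element) doc.getChildNodes().item(i);
      System.out.println(element.getTagName()); // prints "root"
    }
  }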
Modified: nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java Thu Jan 29 05:38:59 2015
@@ -39,27 +39,26 @@ import com.ibm.icu.text.CharsetMatch;
 
 /**
  * A simple class for detecting character encodings.
- * 
+ * 
  * <p>
  * Broadly this encompasses two functions, which are distinctly separate:
- * 
+ * 
  * <ol>
- * <li>Auto detecting a set of "clues" from input text.</li>
- * <li>Taking a set of clues and making a "best guess" as to the
- * "real" encoding.</li>
+ * <li>Auto detecting a set of "clues" from input text.</li>
+ * <li>Taking a set of clues and making a "best guess" as to the "real"
+ * encoding.</li>
  * </ol>
 * </p>
- * 
+ * 
 * <p>
- * A caller will often have some extra information about what the
- * encoding might be (e.g. from the HTTP header or HTML meta-tags, often
- * wrong but still potentially useful clues). The types of clues may differ
- * from caller to caller. Thus a typical calling sequence is:
+ * A caller will often have some extra information about what the encoding might
+ * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
+ * potentially useful clues). The types of clues may differ from caller to
+ * caller. Thus a typical calling sequence is:
  * <ul>
- * <li>Run step (1) to generate a set of auto-detected clues;</li>
- * <li>Combine these clues with the caller-dependent "extra clues"
- * available;</li>
- * <li>Run step (2) to guess what the most probable answer is.</li>
+ * <li>Run step (1) to generate a set of auto-detected clues;</li>
+ * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
+ * <li>Run step (2) to guess what the most probable answer is.</li>
 * </p>
 */
 public class EncodingDetector {
@@ -89,34 +88,32 @@ public class EncodingDetector {
     }
 
     public String toString() {
-      return value + " (" + source +
-        ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
+      return value + " (" + source
+          + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
     }
 
     public boolean isEmpty() {
-      return (value==null || "".equals(value));
+      return (value == null || "".equals(value));
     }
 
     public boolean meetsThreshold() {
-      return (confidence < 0 ||
-        (minConfidence >= 0 && confidence >= minConfidence));
+      return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
     }
   }
 
-  public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(EncodingDetector.class);
 
   public static final int NO_THRESHOLD = -1;
 
-  public static final String MIN_CONFIDENCE_KEY =
-    "encodingdetector.charset.min.confidence";
+  public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
 
-  private static final HashMap<String, String> ALIASES =
-    new HashMap<String, String>();
+  private static final HashMap<String, String> ALIASES = new HashMap<String, String>();
 
   private static final HashSet<String> DETECTABLES = new HashSet<String>();
 
   // CharsetDetector will die without a minimum amount of data.
-  private static final int MIN_LENGTH=4;
+  private static final int MIN_LENGTH = 4;
 
   static {
     DETECTABLES.add("text/html");
@@ -129,23 +126,22 @@ public class EncodingDetector {
     DETECTABLES.add("application/rss+xml");
     DETECTABLES.add("application/xhtml+xml");
     /*
-     * the following map is not an alias mapping table, but
-     * maps character encodings which are often used in mislabelled
-     * documents to their correct encodings. For instance,
-     * there are a lot of documents labelled 'ISO-8859-1' which contain
-     * characters not covered by ISO-8859-1 but covered by windows-1252.
-     * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
-     * for the common part), it's better to treat ISO-8859-1 as
-     * synonymous with windows-1252 than to reject, as invalid, documents
-     * labelled as ISO-8859-1 that have characters outside ISO-8859-1.
+     * the following map is not an alias mapping table, but maps character
+     * encodings which are often used in mislabelled documents to their correct
+     * encodings. For instance, there are a lot of documents labelled
+     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
+     * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
+     * (sharing code points for the common part), it's better to treat
+     * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
+     * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
     */
    ALIASES.put("ISO-8859-1", "windows-1252");
    ALIASES.put("EUC-KR", "x-windows-949");
    ALIASES.put("x-EUC-CN", "GB18030");
    ALIASES.put("GBK", "GB18030");
-    //ALIASES.put("Big5", "Big5HKSCS");
-    //ALIASES.put("TIS620", "Cp874");
-    //ALIASES.put("ISO-8859-11", "Cp874");
+    // ALIASES.put("Big5", "Big5HKSCS");
+    // ALIASES.put("TIS620", "Cp874");
+    // ALIASES.put("ISO-8859-11", "Cp874");
   }
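The ISO-8859-1 to windows-1252 mapping installed above is easy to demonstrate: bytes in the 0x80-0x9F range are unassigned control codes in ISO-8859-1 but printable punctuation in windows-1252, and mislabelled documents use them freely. A small JDK-only illustration:

  import java.nio.charset.Charset;

  public class AliasSketch {
    public static void main(String[] args) {
      // 0x93 and 0x94 are curly quotes in windows-1252
      byte[] bytes = { (byte) 0x93, 'h', 'i', (byte) 0x94 };
      // decoded as ISO-8859-1: invisible C1 control characters
      System.out.println(new String(bytes, Charset.forName("ISO-8859-1")));
      // decoded as windows-1252: prints the intended quoted text
      System.out.println(new String(bytes, Charset.forName("windows-1252")));
    }
  }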
@@ -188,8 +184,9 @@ public class EncodingDetector {
     }
 
     // add character encoding coming from HTTP response header
-    addClue(parseCharacterEncoding(
-        content.getMetadata().get(Response.CONTENT_TYPE)), "header");
+    addClue(
+        parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)),
+        "header");
   }
 
   public void addClue(String value, String source, int confidence) {
@@ -208,21 +205,23 @@ public class EncodingDetector {
 
   /**
    * Guess the encoding with the previously specified list of clues.
-   * 
-   * @param content Content instance
-   * @param defaultValue Default encoding to return if no encoding can be
-   * detected with enough confidence. Note that this will <b>not</b> be
-   * normalized with {@link EncodingDetector#resolveEncodingAlias}
-   * 
+   * 
+   * @param content
+   *          Content instance
+   * @param defaultValue
+   *          Default encoding to return if no encoding can be detected with
+   *          enough confidence. Note that this will <b>not</b> be normalized
+   *          with {@link EncodingDetector#resolveEncodingAlias}
+   * 
    * @return Guessed encoding or defaultValue
    */
   public String guessEncoding(Content content, String defaultValue) {
     /*
-     * This algorithm could be replaced by something more sophisticated;
-     * ideally we would gather a bunch of data on where various clues
-     * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
-     * the correct answer, and use machine learning/some statistical method
-     * to generate a better heuristic.
+     * This algorithm could be replaced by something more sophisticated; ideally
+     * we would gather a bunch of data on where various clues (autodetect, HTTP
+     * headers, HTML meta tags, etc.) disagree, tag each with the correct
+     * answer, and use machine learning/some statistical method to generate a
+     * better heuristic.
      */
 
     String base = content.getBaseUrl();
@@ -232,10 +231,9 @@ public class EncodingDetector {
     }
 
     /*
-     * Go down the list of encoding "clues". Use a clue if:
-     * 1. Has a confidence value which meets our confidence threshold, OR
-     * 2. Doesn't meet the threshold, but is the best try,
-     * since nothing else is available.
+     * Go down the list of encoding "clues". Use a clue if: 1. Has a confidence
+     * value which meets our confidence threshold, OR 2. Doesn't meet the
+     * threshold, but is the best try, since nothing else is available.
      */
     EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
     EncodingClue bestClue = defaultClue;
@@ -247,8 +245,8 @@ public class EncodingDetector {
       String charset = clue.value;
       if (minConfidence >= 0 && clue.confidence >= minConfidence) {
         if (LOG.isTraceEnabled()) {
-          LOG.trace(base + ": Choosing encoding: " + charset +
-              " with confidence " + clue.confidence);
+          LOG.trace(base + ": Choosing encoding: " + charset
+              + " with confidence " + clue.confidence);
         }
         return resolveEncodingAlias(charset).toLowerCase();
       } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
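Putting the clue machinery above together: confidence filtering is controlled through MIN_CONFIDENCE_KEY, and guessEncoding() falls back to the supplied default when nothing meets the threshold. A sketch built only from calls visible in this diff; the URL, payload, and fallback encoding are illustrative:

  import org.apache.hadoop.conf.Configuration;
  import org.apache.nutch.metadata.Metadata;
  import org.apache.nutch.protocol.Content;
  import org.apache.nutch.util.EncodingDetector;
  import org.apache.nutch.util.NutchConfiguration;

  public class GuessEncodingSketch {
    public static void main(String[] args) throws Exception {
      Configuration conf = NutchConfiguration.create();
      // clues below this confidence are skipped unless nothing better exists
      conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);

      byte[] data = "<html><body>caf\u00e9</body></html>".getBytes("UTF-8");
      Content content = new Content("http://example.com/",
          "http://example.com/", data, "text/html", new Metadata(), conf);

      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      String encoding = detector.guessEncoding(content, "windows-1252");
      System.out.println("guessed: " + encoding);
    }
  }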
@@ -268,10 +266,10 @@ public class EncodingDetector {
   }
 
   /*
-   * Strictly for analysis, look for "disagreements." The top guess from
-   * each source is examined; if these meet the threshold and disagree, then
-   * we log the information -- useful for testing or generating training data
-   * for a better heuristic.
+   * Strictly for analysis, look for "disagreements." The top guess from each
+   * source is examined; if these meet the threshold and disagree, then we log
+   * the information -- useful for testing or generating training data for a
+   * better heuristic.
    */
   private void findDisagreements(String url, List<EncodingClue> newClues) {
     HashSet<String> valsSeen = new HashSet<String>();
@@ -293,9 +291,9 @@ public class EncodingDetector {
     if (disagreement) {
       // dump all values in case of disagreement
       StringBuffer sb = new StringBuffer();
-      sb.append("Disagreement: "+url+"; ");
+      sb.append("Disagreement: " + url + "; ");
       for (int i = 0; i < newClues.size(); i++) {
-        if (i>0) {
+        if (i > 0) {
           sb.append(", ");
         }
         sb.append(newClues.get(i));
@@ -310,7 +308,7 @@ public class EncodingDetector {
       return null;
     String canonicalName = new String(Charset.forName(encoding).name());
     return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
-      : canonicalName;
+        : canonicalName;
   } catch (Exception e) {
     LOG.warn("Invalid encoding " + encoding + " detected, using default.");
     return null;
@@ -318,14 +316,14 @@ public class EncodingDetector {
   }
 
   /**
-   * Parse the character encoding from the specified content type header.
-   * If the content type is null, or there is no explicit character encoding,
-   * <code>null</code> is returned.
-   * <br />
-   * This method was copied from org.apache.catalina.util.RequestUtil,
-   * which is licensed under the Apache License, Version 2.0 (the "License").
-   * 
-   * @param contentType a content type header
+   * Parse the character encoding from the specified content type header. If the
+   * content type is null, or there is no explicit character encoding,
+   * <code>null</code> is returned. <br />
+   * This method was copied from org.apache.catalina.util.RequestUtil, which is
+   * licensed under the Apache License, Version 2.0 (the "License").
+   * 
+   * @param contentType
+   *          a content type header
   */
  public static String parseCharacterEncoding(String contentType) {
    if (contentType == null)
@@ -339,7 +337,7 @@ public class EncodingDetector {
       encoding = encoding.substring(0, end);
     encoding = encoding.trim();
     if ((encoding.length() > 2) && (encoding.startsWith("\""))
-      && (encoding.endsWith("\"")))
+        && (encoding.endsWith("\"")))
       encoding = encoding.substring(1, encoding.length() - 1);
 
     return (encoding.trim());
@@ -352,12 +350,12 @@ public class EncodingDetector {
     }
 
     Configuration conf = NutchConfiguration.create();
-    EncodingDetector detector =
-      new EncodingDetector(NutchConfiguration.create());
+    EncodingDetector detector = new EncodingDetector(
+        NutchConfiguration.create());
 
     // do everything as bytes; don't want any conversion
-    BufferedInputStream istr =
-      new BufferedInputStream(new FileInputStream(args[0]));
+    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(
+        args[0]));
     ByteArrayOutputStream ostr = new ByteArrayOutputStream();
     byte[] bytes = new byte[1000];
     boolean more = true;
@@ -376,8 +374,8 @@ public class EncodingDetector {
     byte[] data = ostr.toByteArray();
 
     // make a fake Content
-    Content content =
-      new Content("", "", data, "text/html", new Metadata(), conf);
+    Content content = new Content("", "", data, "text/html", new Metadata(),
+        conf);
 
     detector.autoDetectClues(content, true);
     String encoding = detector.guessEncoding(content,
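parseCharacterEncoding() appears in full above, including the substring logic for quoted charsets; a few representative inputs for it, with expected results in comments:

  import org.apache.nutch.util.EncodingDetector;

  public class HeaderCharsetSketch {
    public static void main(String[] args) {
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html; charset=utf-8"));     // utf-8
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html; charset=\"utf-8\"")); // utf-8
      System.out.println(EncodingDetector
          .parseCharacterEncoding("text/html"));                    // null
    }
  }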
