Here is DumpSegment.java, which I use for debugging. The output is sorted by URL.
Doug: where should I put it, ./src/java/net/nutch/fetcher or ./src/java/net/nutch/util ? John ------------------------------ DumpSegment.java ------------------------------ /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.fetcher; import net.nutch.pagedb.FetchListEntry; import net.nutch.io.*; import net.nutch.util.*; import net.nutch.protocol.*; import net.nutch.parse.*; //import net.nutch.plugin.*; import java.io.File; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.Properties; import java.util.logging.*; /******************************** * Dump contents in one segment. * * @author John Xing ********************************/ public class DumpSegment { public static final Logger LOG = LogFormatter.getLogger("net.nutch.fetcher.DumpSegment"); // for stats private long start; // start time private long bytes; // total bytes parsed private int pages; // total pages parsed private int errors; // total pages errored private String fetcherFile; private String contentFile; private String parseDataFile; private String parseTextFile; private String unsortedFile; private String sortedFile; // ctor public DumpSegment(String directory) { this.fetcherFile = directory+"/"+FetcherOutput.DIR_NAME; this.contentFile = directory+"/"+Content.DIR_NAME; this.parseDataFile = directory+"/"+ParseData.DIR_NAME; this.parseTextFile = directory+"/"+ParseText.DIR_NAME; this.unsortedFile = directory+"/"+FetcherOutput.DIR_NAME+".url.unsorted"; this.sortedFile = directory+"/"+FetcherOutput.DIR_NAME+".url.sorted"; } // dump public void dump() throws IOException { ArrayFile.Reader fetcherReader = new ArrayFile.Reader(fetcherFile); ArrayFile.Reader contentReader = new ArrayFile.Reader(contentFile); ArrayFile.Reader parseDataReader = new ArrayFile.Reader(parseDataFile); ArrayFile.Reader parseTextReader = new ArrayFile.Reader(parseTextFile); 
SequenceFile.Reader seqReader = new SequenceFile.Reader(sortedFile); UTF8 url = new UTF8(); LongWritable entry = new LongWritable(); FetcherOutput fetcherOutput = new FetcherOutput(); Content content = new Content(); ParseData parseData = new ParseData(); ParseText parseText = new ParseText(); while (seqReader.next(url,entry)) { String urlString = url.toString(); long recno = entry.get(); if (fetcherReader.get(recno, fetcherOutput) == null || contentReader.get(recno, content) == null || parseDataReader.get(recno, parseData) == null || parseTextReader.get(recno, parseText) == null) break; //System.out.print("URL:: "+urlString+"\n"); System.out.print("Recno:: "+recno+"\n"); System.out.print("FetcherOutput::\n"); System.out.print(fetcherOutput.toString()); System.out.print("ParseData::\n"); System.out.print(parseData.toString()); System.out.print("ParseText::\n"); System.out.print(parseText.toString()); System.out.print("\n"); } fetcherReader.close(); contentReader.close(); parseDataReader.close(); parseTextReader.close(); seqReader.close(); new File(sortedFile).delete(); } // sort public void sort() throws IOException { // make a SequenceFile ArrayFile.Reader fetcherReader = new ArrayFile.Reader(fetcherFile); SequenceFile.Writer seqWriter = new SequenceFile.Writer (unsortedFile, UTF8.class, LongWritable.class); FetchListEntry fle; String urlString; FetcherOutput fetcherOutput = new FetcherOutput(); long count = 0; while (fetcherReader.next(fetcherOutput) != null) { fle = fetcherOutput.getFetchListEntry(); urlString = fle.getPage().getURL().toString(); seqWriter.append(new UTF8(urlString), new LongWritable(count)); count++; } fetcherReader.close(); seqWriter.close(); // sort the SequenceFile long start = System.currentTimeMillis(); SequenceFile.Sorter sorter = new SequenceFile.Sorter (new UTF8.Comparator(), LongWritable.class); sorter.sort(unsortedFile, sortedFile); double localSecs = (System.currentTimeMillis() - start) / 1000.0; LOG.info("Sorted: " + count + " 
entries in " + localSecs + "s, " + (count/localSecs) + " entries/s"); new File(unsortedFile).delete(); } // run it public static void main(String[] args) throws Exception { boolean dump = false; String which = null; String logLevel = "info"; String directory = null; //String usage = "Usage: DumpSegment [-logLevel level] dir"; String usage = "Usage: DumpSegment dir"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } // parse command line for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; } else if (i != args.length-1) { System.err.println(usage); System.exit(-1); } else { directory = args[i]; } } DumpSegment dumpSegment = new DumpSegment(directory); LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); dumpSegment.sort(); dumpSegment.dump(); } } ------------------------------------------------------- This SF.Net email is sponsored by BEA Weblogic Workshop FREE Java Enterprise J2EE developer tools! Get your free copy of BEA WebLogic Workshop 8.1 today. http://ads.osdn.com/?ad_id=4721&alloc_id=10040&op=click _______________________________________________ Nutch-developers mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-developers
