Update of /cvsroot/nutch/nutch/src/java/net/nutch/fetcher
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2774/src/java/net/nutch/fetcher
Modified Files: Fetcher.java FetcherOutput.java Log Message: Introduce option -noParsing to fetcher and add ParseSegment.java. Index: Fetcher.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/Fetcher.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** Fetcher.java 22 Oct 2004 23:47:57 -0000 1.42 --- Fetcher.java 26 Oct 2004 00:03:14 -0000 1.43 *************** *** 18,22 **** import java.util.logging.*; ! /** The fetcher. Most of the work is done by plugins. */ public class Fetcher { --- 18,31 ---- import java.util.logging.*; ! /** ! * The fetcher. Most of the work is done by plugins. ! * ! * <p> ! * Note by John Xing: As of 20041022, option -noParsing is introduced. ! * Without this option, fetcher behaves the old way, i.e., it not only ! * crawls but also parses content. With option -noParsing, fetcher ! * does crawl only. Use ParseSegment.java to parse fetched contents. ! * Check FetcherOutput.java and ParseSegment.java for further description. ! */ public class Fetcher { *************** *** 42,45 **** --- 51,56 ---- private int errors; // total pages errored + private boolean parsing = true; // whether do parsing + private int threadCount = // max number of threads NutchConf.getInt("fetcher.threads.fetch", 10); *************** *** 110,114 **** } catch (ParseException e) { // don't retry logError(url, fle, e); ! handleNoFetch(fle, FetcherOutput.NOT_FOUND); } catch (RetryLater e) { // explicit retry --- 121,125 ---- } catch (ParseException e) { // don't retry logError(url, fle, e); ! handleNoFetch(fle, FetcherOutput.CANT_PARSE); } catch (RetryLater e) { // explicit retry *************** *** 154,164 **** throws ParseException { ! String contentType = content.getContentType(); ! Parser parser = ParserFactory.getParser(contentType, url); ! Parse parse = parser.getParse(content); ! ! 
outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS), content, new ParseText(parse.getText()), parse.getData()); } --- 165,180 ---- throws ParseException { ! if (Fetcher.this.parsing) { ! String contentType = content.getContentType(); ! Parser parser = ParserFactory.getParser(contentType, url); ! Parse parse = parser.getParse(content); ! outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS), content, new ParseText(parse.getText()), parse.getData()); + } else { + outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()), + FetcherOutput.SUCCESS), + content, null, null); + } } *************** *** 167,205 **** MD5Hash hash = MD5Hash.digest(url); ! outputPage(new FetcherOutput(fle, hash, status), ! new Content(url, url, new byte[0], "", new Properties()), ! new ParseText(""), ! new ParseData("", new Outlink[0], new Properties())); } - } ! private void outputPage(FetcherOutput fo, Content content, ! ParseText text, ParseData parseData) { ! try { ! synchronized (fetcherWriter) { ! fetcherWriter.append(fo); ! contentWriter.append(content); ! parseTextWriter.append(text); ! parseDataWriter.append(parseData); } - } catch (Throwable t) { - LOG.severe("error writing output:" + t.toString()); } - } ! public Fetcher(NutchFileSystem nfs, String directory) throws IOException { // Set up in/out streams fetchList = new ArrayFile.Reader (nfs, new File(directory, FetchListEntry.DIR_NAME).toString()); ! fetcherWriter = new ArrayFile.Writer ! (nfs, new File(directory, FetcherOutput.DIR_NAME).toString(), ! FetcherOutput.class); contentWriter = new ArrayFile.Writer (nfs, new File(directory, Content.DIR_NAME).toString(), Content.class); ! parseTextWriter = new ArrayFile.Writer ! (nfs, new File(directory, ParseText.DIR_NAME).toString(), ParseText.class); ! parseDataWriter = new ArrayFile.Writer ! 
(nfs, new File(directory, ParseData.DIR_NAME).toString(), ParseData.class); name = new File(directory).getName(); } --- 183,241 ---- MD5Hash hash = MD5Hash.digest(url); ! if (Fetcher.this.parsing) { ! outputPage(new FetcherOutput(fle, hash, status), ! new Content(url, url, new byte[0], "", new Properties()), ! new ParseText(""), ! new ParseData("", new Outlink[0], new Properties())); ! } else { ! outputPage(new FetcherOutput(fle, hash, status), ! new Content(url, url, new byte[0], "", new Properties()), ! null, null); ! } } ! private void outputPage(FetcherOutput fo, Content content, ! ParseText text, ParseData parseData) { ! try { ! synchronized (fetcherWriter) { ! fetcherWriter.append(fo); ! contentWriter.append(content); ! if (Fetcher.this.parsing) { ! parseTextWriter.append(text); ! parseDataWriter.append(parseData); ! } ! } ! } catch (Throwable t) { ! LOG.severe("error writing output:" + t.toString()); } } + } ! public Fetcher(NutchFileSystem nfs, String directory, boolean parsing) ! throws IOException { ! ! this.parsing = parsing; ! // Set up in/out streams fetchList = new ArrayFile.Reader (nfs, new File(directory, FetchListEntry.DIR_NAME).toString()); ! if (this.parsing) { ! fetcherWriter = new ArrayFile.Writer ! (nfs, new File(directory, FetcherOutput.DIR_NAME).toString(), ! FetcherOutput.class); ! } else { ! fetcherWriter = new ArrayFile.Writer ! (nfs, new File(directory, FetcherOutput.DIR_NAME_NP).toString(), ! FetcherOutput.class); ! } contentWriter = new ArrayFile.Writer (nfs, new File(directory, Content.DIR_NAME).toString(), Content.class); ! if (this.parsing) { ! parseTextWriter = new ArrayFile.Writer(nfs, ! new File(directory, ParseText.DIR_NAME).toString(), ParseText.class); ! parseDataWriter = new ArrayFile.Writer(nfs, ! new File(directory, ParseData.DIR_NAME).toString(), ParseData.class); ! } name = new File(directory).getName(); } *************** *** 265,270 **** fetcherWriter.close(); contentWriter.close(); ! parseTextWriter.close(); ! 
parseDataWriter.close(); } --- 301,308 ---- fetcherWriter.close(); contentWriter.close(); ! if (this.parsing) { ! parseTextWriter.close(); ! parseDataWriter.close(); ! } } *************** *** 329,336 **** long delay = -1; String logLevel = "info"; boolean showThreadID = false; String directory = null; ! String usage = "Usage: Fetcher (-local | -ndfs <namenode:port>) [-logLevel level] [-showThreadID] [-threads n] <dir>"; if (args.length == 0) { --- 367,375 ---- long delay = -1; String logLevel = "info"; + boolean parsing = true; boolean showThreadID = false; String directory = null; ! String usage = "Usage: Fetcher (-local | -ndfs <namenode:port>) [-logLevel level] [-noParsing] [-showThreadID] [-threads n] <dir>"; if (args.length == 0) { *************** *** 348,351 **** --- 387,392 ---- } else if (args[i].equals("-logLevel")) { logLevel = args[++i]; + } else if (args[i].equals("-noParsing")) { + parsing = false; } else if (args[i].equals("-showThreadID")) { showThreadID = true; *************** *** 354,358 **** } ! Fetcher fetcher = new Fetcher(nfs, directory);// make a Fetcher if (threadCount != -1) { // set threadCount option fetcher.setThreadCount(threadCount); --- 395,399 ---- } ! Fetcher fetcher = new Fetcher(nfs, directory, parsing);// make a Fetcher if (threadCount != -1) { // set threadCount option fetcher.setThreadCount(threadCount); *************** *** 371,374 **** --- 412,416 ---- nfs.close(); } + } } Index: FetcherOutput.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/FetcherOutput.java,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** FetcherOutput.java 20 Aug 2004 20:36:10 -0000 1.22 --- FetcherOutput.java 26 Oct 2004 00:03:15 -0000 1.23 *************** *** 19,26 **** --- 19,37 ---- * separate files. * + * <p> + * Note by John Xing: As of 20041022, option -noParsing is introduced + * in Fetcher.java. This changes fetcher behavior. 
Accordingly + * there are necessary modifications in this class. + * Check Fetcher.java and ParseSegment.java for details. + * * @author Doug Cutting *********************************************/ public final class FetcherOutput implements Writable { public static final String DIR_NAME = "fetcher"; + // 20041024, xing, + // When fetcher is run with option -noParsing, DIR_NAME_NP is created + // instead of DIR_NAME. In separate pass, ParseSegment.java looks for + // DIR_NAME_NP and generates DIR_NAME. Check ParseSegment.java for more info. + public static final String DIR_NAME_NP = DIR_NAME+"_output"; public static final String DONE_NAME = "fetcher.done"; public static final String ERROR_NAME = "fetcher.error"; *************** *** 31,34 **** --- 42,46 ---- public final static byte SUCCESS = 1; public final static byte NOT_FOUND = 2; + public final static byte CANT_PARSE = 3; // fetched, but can't be parsed private FetchListEntry fetchListEntry; *************** *** 86,89 **** --- 98,102 ---- public MD5Hash getMD5Hash() { return md5Hash; } public int getStatus() { return status; } + public void setStatus(int status) { this.status = status; } public long getFetchDate() { return fetchDate; } public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; } ------------------------------------------------------- This SF.net email is sponsored by: IT Product Guide on ITManagersJournal Use IT products in your business? Tell us what you think of them. Give us Your Opinions, Get Free ThinkGeek Gift Certificates! Click to find out more http://productguide.itmanagersjournal.com/guidepromo.tmpl _______________________________________________ Nutch-cvs mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-cvs