fetcher Fetcher.java,1.42,1.43 FetcherOutput.java,1.22,1.23

John Mon, 25 Oct 2004 18:01:07 -0700

Update of /cvsroot/nutch/nutch/src/java/net/nutch/fetcher
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2774/src/java/net/nutch/fetcher


Modified Files:
        Fetcher.java FetcherOutput.java 
Log Message:
Introduce option -noParsing to fetcher and add ParseSegment.java.


Index: Fetcher.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/Fetcher.java,v
retrieving revision 1.42
retrieving revision 1.43
diff -C2 -d -r1.42 -r1.43
*** Fetcher.java        22 Oct 2004 23:47:57 -0000      1.42
--- Fetcher.java        26 Oct 2004 00:03:14 -0000      1.43
***************
*** 18,22 ****
  import java.util.logging.*;
  
! /** The fetcher.  Most of the work is done by plugins. */
  public class Fetcher {
  
--- 18,31 ----
  import java.util.logging.*;
  
! /**
!  * The fetcher. Most of the work is done by plugins.
!  *
!  * <p>
!  * Note by John Xing: As of 20041022, option -noParsing is introduced.
!  * Without this option, fetcher behaves the old way, i.e., it not only
!  * crawls but also parses content. With option -noParsing, fetcher
!  * does crawl only. Use ParseSegment.java to parse fetched contents.
!  * Check FetcherOutput.java and ParseSegment.java for further description.
!  */
  public class Fetcher {
  
***************
*** 42,45 ****
--- 51,56 ----
    private int errors;                             // total pages errored
  
+   private boolean parsing = true;                 // whether do parsing
+ 
    private int threadCount =                       // max number of threads
      NutchConf.getInt("fetcher.threads.fetch", 10);
***************
*** 110,114 ****
          } catch (ParseException e) {              // don't retry
            logError(url, fle, e);
!           handleNoFetch(fle, FetcherOutput.NOT_FOUND);
  
          } catch (RetryLater e) {                  // explicit retry
--- 121,125 ----
          } catch (ParseException e) {              // don't retry
            logError(url, fle, e);
!           handleNoFetch(fle, FetcherOutput.CANT_PARSE);
  
          } catch (RetryLater e) {                  // explicit retry
***************
*** 154,164 ****
        throws ParseException {
  
!       String contentType = content.getContentType();
!       Parser parser = ParserFactory.getParser(contentType, url);
!       Parse parse = parser.getParse(content);
!       
!       outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
                                     FetcherOutput.SUCCESS),
                   content, new ParseText(parse.getText()), parse.getData());
      }
  
--- 165,180 ----
        throws ParseException {
  
!       if (Fetcher.this.parsing) {
!         String contentType = content.getContentType();
!         Parser parser = ParserFactory.getParser(contentType, url);
!         Parse parse = parser.getParse(content);
!         outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
                                     FetcherOutput.SUCCESS),
                   content, new ParseText(parse.getText()), parse.getData());
+       } else {
+         outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+                                    FetcherOutput.SUCCESS),
+                  content, null, null);
+       }
      }
  
***************
*** 167,205 ****
        MD5Hash hash = MD5Hash.digest(url);
  
!       outputPage(new FetcherOutput(fle, hash, status),
!                  new Content(url, url, new byte[0], "", new Properties()),
!                  new ParseText(""),
!                  new ParseData("", new Outlink[0], new Properties()));
      }
-   }
        
!   private void outputPage(FetcherOutput fo, Content content,
!                           ParseText text, ParseData parseData) {
!     try {
!       synchronized (fetcherWriter) {
!         fetcherWriter.append(fo);
!         contentWriter.append(content);
!         parseTextWriter.append(text);
!         parseDataWriter.append(parseData);
        }
-     } catch (Throwable t) {
-       LOG.severe("error writing output:" + t.toString());
      }
-   }
                                         
                        
!   public Fetcher(NutchFileSystem nfs, String directory) throws IOException {
      // Set up in/out streams
      fetchList = new ArrayFile.Reader
        (nfs, new File(directory, FetchListEntry.DIR_NAME).toString());
!     fetcherWriter = new ArrayFile.Writer
!       (nfs, new File(directory, FetcherOutput.DIR_NAME).toString(),
!        FetcherOutput.class);
      contentWriter = new ArrayFile.Writer
        (nfs, new File(directory, Content.DIR_NAME).toString(), Content.class);
!     parseTextWriter = new ArrayFile.Writer
!       (nfs, new File(directory, ParseText.DIR_NAME).toString(), ParseText.class);
!     parseDataWriter = new ArrayFile.Writer
!       (nfs, new File(directory, ParseData.DIR_NAME).toString(), ParseData.class);
      name = new File(directory).getName();
    }
--- 183,241 ----
        MD5Hash hash = MD5Hash.digest(url);
  
!       if (Fetcher.this.parsing) {
!         outputPage(new FetcherOutput(fle, hash, status),
!                    new Content(url, url, new byte[0], "", new Properties()),
!                    new ParseText(""),
!                    new ParseData("", new Outlink[0], new Properties()));
!       } else {
!         outputPage(new FetcherOutput(fle, hash, status),
!                    new Content(url, url, new byte[0], "", new Properties()),
!                    null, null);
!       }
      }
        
!     private void outputPage(FetcherOutput fo, Content content,
!                             ParseText text, ParseData parseData) {
!       try {
!         synchronized (fetcherWriter) {
!           fetcherWriter.append(fo);
!           contentWriter.append(content);
!           if (Fetcher.this.parsing) {
!             parseTextWriter.append(text);
!             parseDataWriter.append(parseData);
!           }
!         }
!       } catch (Throwable t) {
!         LOG.severe("error writing output:" + t.toString());
        }
      }
                                         
+   }
                        
!   public Fetcher(NutchFileSystem nfs, String directory, boolean parsing)
!     throws IOException {
! 
!     this.parsing = parsing;
! 
      // Set up in/out streams
      fetchList = new ArrayFile.Reader
        (nfs, new File(directory, FetchListEntry.DIR_NAME).toString());
!     if (this.parsing) {
!       fetcherWriter = new ArrayFile.Writer
!         (nfs, new File(directory, FetcherOutput.DIR_NAME).toString(),
!         FetcherOutput.class);
!     } else {
!       fetcherWriter = new ArrayFile.Writer
!         (nfs, new File(directory, FetcherOutput.DIR_NAME_NP).toString(),
!         FetcherOutput.class);
!     }
      contentWriter = new ArrayFile.Writer
        (nfs, new File(directory, Content.DIR_NAME).toString(), Content.class);
!     if (this.parsing) {
!       parseTextWriter = new ArrayFile.Writer(nfs,
!         new File(directory, ParseText.DIR_NAME).toString(), ParseText.class);
!       parseDataWriter = new ArrayFile.Writer(nfs,
!         new File(directory, ParseData.DIR_NAME).toString(), ParseData.class);
!     }
      name = new File(directory).getName();
    }
***************
*** 265,270 ****
      fetcherWriter.close();
      contentWriter.close();
!     parseTextWriter.close();
!     parseDataWriter.close();
  
    }
--- 301,308 ----
      fetcherWriter.close();
      contentWriter.close();
!     if (this.parsing) {
!       parseTextWriter.close();
!       parseDataWriter.close();
!     }
  
    }
***************
*** 329,336 ****
      long delay = -1;
      String logLevel = "info";
      boolean showThreadID = false;
      String directory = null;
  
!     String usage = "Usage: Fetcher (-local | -ndfs <namenode:port>) [-logLevel level] [-showThreadID] [-threads n] <dir>";
  
      if (args.length == 0) {
--- 367,375 ----
      long delay = -1;
      String logLevel = "info";
+     boolean parsing = true;
      boolean showThreadID = false;
      String directory = null;
  
!     String usage = "Usage: Fetcher (-local | -ndfs <namenode:port>) [-logLevel level] [-noParsing] [-showThreadID] [-threads n] <dir>";
  
      if (args.length == 0) {
***************
*** 348,351 ****
--- 387,392 ----
        } else if (args[i].equals("-logLevel")) {
          logLevel = args[++i];
+       } else if (args[i].equals("-noParsing")) {
+         parsing = false;
        } else if (args[i].equals("-showThreadID")) {
          showThreadID = true;
***************
*** 354,358 ****
      }
  
!     Fetcher fetcher = new Fetcher(nfs, directory);// make a Fetcher
      if (threadCount != -1) {                      // set threadCount option
        fetcher.setThreadCount(threadCount);
--- 395,399 ----
      }
  
!     Fetcher fetcher = new Fetcher(nfs, directory, parsing);// make a Fetcher
      if (threadCount != -1) {                      // set threadCount option
        fetcher.setThreadCount(threadCount);
***************
*** 371,374 ****
--- 412,416 ----
        nfs.close();
      }
+ 
    }
  }

Index: FetcherOutput.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/FetcherOutput.java,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** FetcherOutput.java  20 Aug 2004 20:36:10 -0000      1.22
--- FetcherOutput.java  26 Oct 2004 00:03:15 -0000      1.23
***************
*** 19,26 ****
--- 19,37 ----
   * separate files.
   *
+  * <p>
+  * Note by John Xing: As of 20041022, option -noParsing is introduced
+  * in Fetcher.java. This changes fetcher behavior. Accordingly
+  * there are necessary modifications in this class.
+  * Check Fetcher.java and ParseSegment.java for details.
+  *
   * @author Doug Cutting
   *********************************************/
  public final class FetcherOutput implements Writable {
    public static final String DIR_NAME = "fetcher";
+   // 20041024, xing, 
+   // When fetcher is run with option -noParsing, DIR_NAME_NP is created
+   // instead of DIR_NAME. In separate pass, ParseSegment.java looks for
+   // DIR_NAME_NP and generates DIR_NAME. Check ParseSegment.java for more info.
+   public static final String DIR_NAME_NP = DIR_NAME+"_output";
    public static final String DONE_NAME = "fetcher.done";
    public static final String ERROR_NAME = "fetcher.error";
***************
*** 31,34 ****
--- 42,46 ----
    public final static byte SUCCESS = 1;
    public final static byte NOT_FOUND = 2;
+   public final static byte CANT_PARSE = 3; // fetched, but can't be parsed
  
    private FetchListEntry fetchListEntry;
***************
*** 86,89 ****
--- 98,102 ----
    public MD5Hash getMD5Hash() { return md5Hash; }
    public int getStatus() { return status; }
+   public void setStatus(int status) { this.status = status; }
    public long getFetchDate() { return fetchDate; }
    public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }



-------------------------------------------------------
This SF.net email is sponsored by: IT Product Guide on ITManagersJournal
Use IT products in your business? Tell us what you think of them. Give us
Your Opinions, Get Free ThinkGeek Gift Certificates! Click to find out more
http://productguide.itmanagersjournal.com/guidepromo.tmpl
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] nutch/src/java/net/nutch/fetcher Fetcher.java,1.42,1.43 FetcherOutput.java,1.22,1.23

Reply via email to