Update of /cvsroot/nutch/nutch/src/java/net/nutch/indexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26341

Modified Files:
        IndexSegment.java 
Log Message:
* Change this to use the SegmentReader API, with autofixing on by default.

* Improve logging.
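
For readers not following the full diff: below is a minimal sketch, distilled
from the patch, of how the indexing loop now reads a segment through
SegmentReader (autofixing enabled via the third constructor argument) and logs
throughput every LOG_STEP records. The class name, method signature, and
wildcard import paths are assumptions for illustration only; the SegmentReader
calls (constructor, size, get, close) and the logging arithmetic come from the
actual change.

import java.io.File;
import java.util.logging.Logger;

import net.nutch.fs.*;        // NutchFileSystem (import paths assumed)
import net.nutch.fetcher.*;   // FetcherOutput (import paths assumed)
import net.nutch.parse.*;     // ParseText, ParseData (import paths assumed)
import net.nutch.segment.SegmentReader;
import net.nutch.util.*;      // LogFormatter

public class SegmentReadSketch {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.indexer.SegmentReadSketch");

  public static int LOG_STEP = 20000;

  /** Read every entry of a segment the way IndexSegment now does. */
  public static void readAll(NutchFileSystem nfs, File srcDir, long maxDocs)
      throws Exception {
    // Third constructor argument turns autofixing on, as in the patch.
    SegmentReader sr = new SegmentReader(nfs, srcDir, true);
    try {
      FetcherOutput fetcherOutput = new FetcherOutput();
      ParseText parseText = new ParseText();
      ParseData parseData = new ParseData();

      long limit = Math.min(sr.size, maxDocs);
      long delta = System.currentTimeMillis();
      for (long count = 0; count < limit; count++) {
        // Skip entries that could not be read.
        if (!sr.get(count, fetcherOutput, null, parseText, parseData)) continue;

        // ... build the Lucene Document and add it to the index here ...

        // Periodic progress logging, every LOG_STEP records.
        if (count > 0 && count % LOG_STEP == 0) {
          long curTime = System.currentTimeMillis();
          LOG.info(" Processed " + count + " records (" +
                   ((float) LOG_STEP * 1000.0f / (float) (curTime - delta)) +
                   " rec/s)");
          delta = curTime;
        }
      }
    } finally {
      sr.close();
    }
  }
}

The single SegmentReader replaces the three ArrayFile.Readers (fetcher, text,
data) that the old code had to advance in lockstep and close individually.
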



Index: IndexSegment.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/indexer/IndexSegment.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** IndexSegment.java   12 Nov 2004 17:04:04 -0000      1.23
--- IndexSegment.java   22 Nov 2004 14:55:23 -0000      1.24
***************
*** 11,14 ****
--- 11,15 ----
  import net.nutch.db.*;
  import net.nutch.io.*;
+ import net.nutch.segment.SegmentReader;
  import net.nutch.util.*;
  
***************
*** 27,30 ****
--- 28,33 ----
    public static final Logger LOG =
      LogFormatter.getLogger("net.nutch.index.IndexSegment");
+   
+   public static int LOG_STEP = 20000;
  
    private boolean boostByLinkCount =
***************
*** 34,38 ****
    private int maxFieldLength = NutchConf.getInt("indexer.max.tokens", 10000);
    private NutchFileSystem nfs;
!   private int maxDocs = Integer.MAX_VALUE;
    private File srcDir;
    private File localWorkingDir;
--- 37,41 ----
    private int maxFieldLength = NutchConf.getInt("indexer.max.tokens", 10000);
    private NutchFileSystem nfs;
!   private long maxDocs = Long.MAX_VALUE;
    private File srcDir;
    private File localWorkingDir;
***************
*** 41,45 ****
     * Index a segment in the given NFS.
     */
!   public IndexSegment(NutchFileSystem nfs, int maxDocs, File srcDir, File localWorkingDir) {
        this.nfs = nfs;
        this.maxDocs = maxDocs;
--- 44,48 ----
     * Index a segment in the given NFS.
     */
!   public IndexSegment(NutchFileSystem nfs, long maxDocs, File srcDir, File localWorkingDir) {
        this.nfs = nfs;
        this.maxDocs = maxDocs;
***************
*** 77,105 ****
        writer.minMergeDocs = 50;
        writer.maxFieldLength = maxFieldLength;
!       writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
        writer.setUseCompoundFile(false);
        writer.setSimilarity(new NutchSimilarity());
  
!       ArrayFile.Reader fetcher = null;
!       ArrayFile.Reader text = null;
!       ArrayFile.Reader data = null;
  
!       int count = 0;
        try {
!           fetcher = new ArrayFile.Reader(nfs, new File(srcDir, FetcherOutput.DIR_NAME).toString());
!           text = new ArrayFile.Reader(nfs, new File(srcDir, ParseText.DIR_NAME).toString());
!           data = new ArrayFile.Reader(nfs, new File(srcDir, ParseData.DIR_NAME).toString());
  
            String segmentName = srcDir.getCanonicalFile().getName();
            FetcherOutput fetcherOutput = new FetcherOutput();
            ParseText parseText = new ParseText();
            ParseData parseData = new ParseData();
  
            //
            // Iterate through all docs in the input
            //
!           while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) {
!               text.next(parseText);
!               data.next(parseData);
  
                // only index the page if it was fetched correctly
--- 80,111 ----
        writer.minMergeDocs = 50;
        writer.maxFieldLength = maxFieldLength;
!       //writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
        writer.setUseCompoundFile(false);
        writer.setSimilarity(new NutchSimilarity());
  
!       SegmentReader sr = null;
  
!       long start = System.currentTimeMillis();
!       long delta = start;
!       long curTime, total = 0;
!       long count = 0;
        try {
!           LOG.info("* Opening segment " + srcDir.getName());
!           sr = new SegmentReader(nfs, srcDir, true);
  
+           total = sr.size;
+           
            String segmentName = srcDir.getCanonicalFile().getName();
            FetcherOutput fetcherOutput = new FetcherOutput();
            ParseText parseText = new ParseText();
            ParseData parseData = new ParseData();
+           LOG.info("* Indexing segment " + srcDir.getName());
  
            //
            // Iterate through all docs in the input
            //
!           maxDocs = Math.min(sr.size, maxDocs);
!           for (count = 0; count < maxDocs; count++) {
!               if (!sr.get(count, fetcherOutput, null, parseText, parseData)) continue;
  
                // only index the page if it was fetched correctly
***************
*** 112,116 ****
  
                // build initial document w/ core fields
!               Document doc = makeDocument(segmentName, fetcher.key(),
                                            fetcherOutput, parse);
  
--- 118,122 ----
  
                // build initial document w/ core fields
!               Document doc = makeDocument(segmentName, count,
                                            fetcherOutput, parse);
  
***************
*** 120,123 ****
--- 126,136 ----
                // add the document to the index
                writer.addDocument(doc);
+               if (count > 0 && count % LOG_STEP == 0) {
+                 curTime = System.currentTimeMillis();
+                 LOG.info(" Processed " + count + " records (" +
+                         ((float)LOG_STEP * 1000.0f / (float)(curTime - delta)) +
+                         " rec/s)");
+                 delta = curTime;
+               }
            }
        } catch (EOFException e) {
***************
*** 125,139 ****
                        " at entry #" + count + ".  Ignoring.");
        } finally {
!           if (fetcher != null) {
!               fetcher.close();
!           }
!           if (text != null) {
!               text.close();
!           }
!           if (data != null) {
!               data.close();
!           }
        }
!       LOG.info("Optimizing index...");
        writer.optimize();
        writer.close();
--- 138,144 ----
                        " at entry #" + count + ".  Ignoring.");
        } finally {
!         sr.close();
        }
!       LOG.info("* Optimizing index...");
        writer.optimize();
        writer.close();
***************
*** 143,146 ****
--- 148,152 ----
        //
      //nfs.completeLocalOutput(new File(outputDir, "index"), new File(srcDir, "index"));
+       LOG.info("* Moving index to NFS if needed...");
        nfs.completeLocalOutput(outputIndex, tmpOutputIndex);
  
***************
*** 150,153 ****
--- 156,163 ----
        OutputStream out = nfs.create(doneFile);
        out.close();
+       delta = System.currentTimeMillis() - start;
+       float eps = (float) count / (float) (delta / 1000);
+       LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total +
+               " records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s).");
    }
  


