Update of /cvsroot/nutch/nutch/src/java/net/nutch/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26341
Modified Files: IndexSegment.java Log Message: * Change this to use SegmentReader API, with autofixing on by default. * Improve logging. Index: IndexSegment.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/indexer/IndexSegment.java,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** IndexSegment.java 12 Nov 2004 17:04:04 -0000 1.23 --- IndexSegment.java 22 Nov 2004 14:55:23 -0000 1.24 *************** *** 11,14 **** --- 11,15 ---- import net.nutch.db.*; import net.nutch.io.*; + import net.nutch.segment.SegmentReader; import net.nutch.util.*; *************** *** 27,30 **** --- 28,33 ---- public static final Logger LOG = LogFormatter.getLogger("net.nutch.index.IndexSegment"); + + public static int LOG_STEP = 20000; private boolean boostByLinkCount = *************** *** 34,38 **** private int maxFieldLength = NutchConf.getInt("indexer.max.tokens", 10000); private NutchFileSystem nfs; ! private int maxDocs = Integer.MAX_VALUE; private File srcDir; private File localWorkingDir; --- 37,41 ---- private int maxFieldLength = NutchConf.getInt("indexer.max.tokens", 10000); private NutchFileSystem nfs; ! private long maxDocs = Long.MAX_VALUE; private File srcDir; private File localWorkingDir; *************** *** 41,45 **** * Index a segment in the given NFS. */ ! public IndexSegment(NutchFileSystem nfs, int maxDocs, File srcDir, File localWorkingDir) { this.nfs = nfs; this.maxDocs = maxDocs; --- 44,48 ---- * Index a segment in the given NFS. */ ! public IndexSegment(NutchFileSystem nfs, long maxDocs, File srcDir, File localWorkingDir) { this.nfs = nfs; this.maxDocs = maxDocs; *************** *** 77,105 **** writer.minMergeDocs = 50; writer.maxFieldLength = maxFieldLength; ! writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO); writer.setUseCompoundFile(false); writer.setSimilarity(new NutchSimilarity()); ! ArrayFile.Reader fetcher = null; ! ArrayFile.Reader text = null; ! ArrayFile.Reader data = null; ! int count = 0; try { ! fetcher = new ArrayFile.Reader(nfs, new File(srcDir, FetcherOutput.DIR_NAME).toString()); ! text = new ArrayFile.Reader(nfs, new File(srcDir, ParseText.DIR_NAME).toString()); ! data = new ArrayFile.Reader(nfs, new File(srcDir, ParseData.DIR_NAME).toString()); String segmentName = srcDir.getCanonicalFile().getName(); FetcherOutput fetcherOutput = new FetcherOutput(); ParseText parseText = new ParseText(); ParseData parseData = new ParseData(); // // Iterate through all docs in the input // ! while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) { ! text.next(parseText); ! data.next(parseData); // only index the page if it was fetched correctly --- 80,111 ---- writer.minMergeDocs = 50; writer.maxFieldLength = maxFieldLength; ! //writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO); writer.setUseCompoundFile(false); writer.setSimilarity(new NutchSimilarity()); ! SegmentReader sr = null; ! long start = System.currentTimeMillis(); ! long delta = start; ! long curTime, total = 0; ! long count = 0; try { ! LOG.info("* Opening segment " + srcDir.getName()); ! sr = new SegmentReader(nfs, srcDir, true); + total = sr.size; + String segmentName = srcDir.getCanonicalFile().getName(); FetcherOutput fetcherOutput = new FetcherOutput(); ParseText parseText = new ParseText(); ParseData parseData = new ParseData(); + LOG.info("* Indexing segment " + srcDir.getName()); // // Iterate through all docs in the input // ! maxDocs = Math.min(sr.size, maxDocs); ! for (count = 0; count < maxDocs; count++) { ! if (!sr.get(count, fetcherOutput, null, parseText, parseData)) continue; // only index the page if it was fetched correctly *************** *** 112,116 **** // build initial document w/ core fields ! Document doc = makeDocument(segmentName, fetcher.key(), fetcherOutput, parse); --- 118,122 ---- // build initial document w/ core fields ! Document doc = makeDocument(segmentName, count, fetcherOutput, parse); *************** *** 120,123 **** --- 126,136 ---- // add the document to the index writer.addDocument(doc); + if (count > 0 && count % LOG_STEP == 0) { + curTime = System.currentTimeMillis(); + LOG.info(" Processed " + count + " records (" + + ((float)LOG_STEP * 1000.0f / (float)(curTime - delta)) + + " rec/s)"); + delta = curTime; + } } } catch (EOFException e) { *************** *** 125,139 **** " at entry #" + count + ". Ignoring."); } finally { ! if (fetcher != null) { ! fetcher.close(); ! } ! if (text != null) { ! text.close(); ! } ! if (data != null) { ! data.close(); ! } } ! LOG.info("Optimizing index..."); writer.optimize(); writer.close(); --- 138,144 ---- " at entry #" + count + ". Ignoring."); } finally { ! sr.close(); } ! LOG.info("* Optimizing index..."); writer.optimize(); writer.close(); *************** *** 143,146 **** --- 148,152 ---- // //nfs.completeLocalOutput(new File(outputDir, "index"), new File(srcDir, "index")); + LOG.info("* Moving index to NFS if needed..."); nfs.completeLocalOutput(outputIndex, tmpOutputIndex); *************** *** 150,153 **** --- 156,163 ---- OutputStream out = nfs.create(doneFile); out.close(); + delta = System.currentTimeMillis() - start; + float eps = (float) count / (float) (delta / 1000); + LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total + + " records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s)."); } ------------------------------------------------------- SF email is sponsored by - The IT Product Guide Read honest & candid reviews on hundreds of IT Products from real users. Discover which products truly live up to the hype. Start reading now. http://productguide.itmanagersjournal.com/ _______________________________________________ Nutch-cvs mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-cvs