Dear Developers,

I think the index size / record count is critical if we want good performance. So, after we dedup and prune the indexes, I think we need to remove the deleted entries from them. I wrote a simple tool that does this and also lists the real number of indexed records in the segments.
If the IndexOptimize tool is truly deprecated, please replace it with this one.
Here is the source code:
-----------------------------------------------------------------------------
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.indexer;

import java.io.EOFException;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Vector;
import java.util.logging.Logger;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.nutch.util.LogFormatter;

/**
 * Optimizes the Lucene index of one or more segments and reports how many
 * documents each optimized index contains, plus a grand total.
 *
 * <p>For every segment directory the index is expected at
 * <code>&lt;segment&gt;/index</code>. A failure on one segment is logged and
 * does not stop the processing of the remaining segments.
 *
 * @author Ferenc Lutischan;
 */
public class OptimizeIndex {
  // NOTE(review): the logger name says "org.apache.nutch.index" while the class
  // lives in "org.apache.nutch.indexer"; kept as-is because existing logging
  // configuration may refer to it -- confirm and align if it is a typo.
  public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.index.OptimizeIndex");

  /**
   * Command-line wrapper. Run without arguments to see usage help.
   *
   * @param args either <code>-dir segments</code> (a directory whose
   *        sub-directories are segments), a list of segment directories, or both
   * @throws Exception declared for interface compatibility; per-segment
   *         failures are logged, not thrown
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      usage();
      return;
    }

    String segDir = null;
    Vector dirs = new Vector();
    for (int i = 0; i < args.length; i++) {
      if (args[i] == null) {
        continue;
      }
      if (args[i].equals("-dir")) {
        // Guard against "-dir" being the last argument, which previously
        // caused an ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
          LOG.severe("Option -dir requires a directory argument.");
          usage();
          return;
        }
        segDir = args[++i];
      } else {
        dirs.add(new File(args[i]));
      }
    }

    if (segDir != null) {
      addSegmentDirs(new File(segDir), dirs);
    }

    if (dirs.size() == 0) {
      LOG.severe("No input segment dirs.");
      usage();
      return;
    }

    long total = 0L;
    int cnt = 0;
    LOG.info("INDEXED\t\tDIR NAME");
    for (int i = 0; i < dirs.size(); i++) {
      File dir = (File) dirs.get(i);
      String indexPath = dir + "/index";
      try {
        int docs = optimizeAndCount(indexPath);
        total += docs;
        cnt++;
        LOG.info(String.valueOf(docs) + "\t\t" + dir);
      } catch (Exception e) {
        // Best effort: report the failing segment and carry on with the rest.
        // Using the exception itself (not getMessage()) avoids logging "null".
        LOG.warning("Failed to optimize " + indexPath + ": " + e);
      }
    }
    LOG.info("INDEXED: " + total + " entries in " + cnt + " segments.");
  }

  /** Adds every sub-directory of <code>parent</code> to <code>dirs</code>. */
  private static void addSegmentDirs(File parent, Vector dirs) {
    if (!parent.exists() || !parent.isDirectory()) {
      LOG.warning("Invalid path: " + parent);
      return;
    }
    File[] children = parent.listFiles(new FileFilter() {
      public boolean accept(File f) {
        return f.isDirectory();
      }
    });
    // listFiles() may return null on an I/O error.
    if (children != null) {
      for (int i = 0; i < children.length; i++) {
        dirs.add(children[i]);
      }
    }
  }

  /**
   * Optimizes the index at <code>indexPath</code> and returns its document
   * count. Writer and reader are each closed exactly once on every path.
   */
  private static int optimizeAndCount(String indexPath) throws IOException {
    IndexWriter writer =
        new IndexWriter(indexPath, new org.apache.lucene.analysis.WhitespaceAnalyzer(), false);
    boolean optimized = false;
    try {
      writer.setUseCompoundFile(false);
      writer.optimize();
      optimized = true;
    } finally {
      if (optimized) {
        writer.close();
      } else {
        // Do not let a close failure mask the original exception.
        closeQuietly(writer, indexPath);
      }
    }

    IndexReader reader = IndexReader.open(indexPath);
    try {
      return reader.numDocs();
    } finally {
      reader.close();
    }
  }

  /** Closes a writer after an earlier failure, logging instead of throwing. */
  private static void closeQuietly(IndexWriter writer, String indexPath) {
    try {
      writer.close();
    } catch (Exception ignored) {
      LOG.fine("Could not close index writer for " + indexPath + ": " + ignored);
    }
  }

  /** Prints command-line usage to standard error. */
  private static void usage() {
    System.err.println("OptimizeIndex (-dir segments | seg1 seg2 ...)");
    System.err.println("\tNOTE: at least one segment dir name is required, or '-dir' option.");
    System.err.println("\t-dir segments\tdirectory containing multiple segments");
    System.err.println("\tseg1 seg2 ...\tsegment directories\n");
  }
}
----------------------------------------------------------------------------------

Regards,
   Ferenc


-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
Nutch-developers mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

Reply via email to