Hello George, Here is a quick hack (with a few TODOs). I only tested it a bit, so the actual delete calls are still commented out. If this works for you, and especially if you take care of TODOs, I may put this in the Lucene Sandbox.
Otis P.S. Usage example showing how the tool found some unused segments (this was caused by a bug in one of the earlier 1.4 versions of Lucene). [EMAIL PROTECTED] java]$ java org.apache.lucene.index.SegmentPurger /simpy/users/1/index Candidate non-Lucene file found: _1b2.del Candidate unused Lucene file found: _1b2.cfs Candidate unused Lucene file found: _1bm.cfs Candidate unused Lucene file found: _1c6.cfs Candidate unused Lucene file found: _1cq.cfs Candidate unused Lucene file found: _1da.cfs Candidate unused Lucene file found: _1du.cfs Candidate unused Lucene file found: _1ee.cfs Candidate unused Lucene file found: _1ey.cfs [EMAIL PROTECTED] java]$ [EMAIL PROTECTED] java]$ strings /simpy/users/1/index/segments _3o0 [EMAIL PROTECTED] java]$ ls -al /simpy/users/1/index/ total 647 drwxrwsr-x 2 otis simpy 1024 Dec 7 14:39 . drwxrwsr-x 3 otis simpy 1024 Sep 16 20:39 .. -rw-rw-r-- 1 otis simpy 212815 Nov 17 18:36 _1b2.cfs -rw-rw-r-- 1 otis simpy 104 Nov 17 18:40 _1b2.del -rw-rw-r-- 1 otis simpy 3380 Nov 17 18:40 _1bm.cfs -rw-rw-r-- 1 otis simpy 3533 Nov 17 18:40 _1c6.cfs -rw-rw-r-- 1 otis simpy 4774 Nov 17 18:40 _1cq.cfs -rw-rw-r-- 1 otis simpy 3389 Nov 17 18:40 _1da.cfs -rw-rw-r-- 1 otis simpy 3809 Nov 17 18:40 _1du.cfs -rw-rw-r-- 1 otis simpy 3423 Nov 17 18:40 _1ee.cfs -rw-rw-r-- 1 otis simpy 4016 Nov 17 18:40 _1ey.cfs -rw-rw-r-- 1 otis simpy 410299 Dec 7 14:39 _3o0.cfs -rw-rw-r-- 1 otis simpy 4 Dec 7 14:39 deletable -rw-rw-r-- 1 otis simpy 29 Dec 7 14:39 segments --- [EMAIL PROTECTED] wrote: > Hello all. > > > > I recently ran into a problem where errors during indexing or > optimization > (perhaps related to running out of disk space) left me with a working > index > in a directory but with additional segment files (partial) that were > unneeded. The solution for finding the ~40 files to keep out of the > ~900 > files in the directory amounted to dumping the segments file and > noting that > only 5 segments were in fact "live". 
The index is a non-compound > index > using FSDirectory. > > > > Is there (or would it be possible to add (and I'd be willing to > submit code > if it made sense to people)) some sort of interrogation on the index > of what > files belonged to it? I looked first at FSDirectory itself thinking > that > its "list()" method should return the subset of index-related files > but > looking deeper it looks like Directory is at a lower level > abstracting > simple I/O and thus wouldn't "know". > > > > So any thoughts? Would it make sense to have a form of clean on > IndexWriter()? I hesitate since it seems there isn't a charter that > only > Lucene files could exist in the directory thus what is ideal for my > application (since I know I won't mingle other files) might not be > ideal for > all. Would it be fair to look for Lucene known extensions and file > naming > signatures to identify unused files that might be failed or dead > segments? > > > > Thanks, > > -George > >
package org.apache.lucene.index;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * A tool that peeks into Lucene index directories and reports files that
 * look unwanted. In its more radical mode it looks for all non-Lucene
 * files in a directory; the other mode looks for Lucene segment files that
 * are not referenced by the <code>segments</code> file, should the index
 * directory get polluted.
 *
 * <p>This is a dry run: candidates are only printed to standard output.
 * The actual <code>deleteFile</code> calls are deliberately left disabled
 * (see the comments in the purge methods).
 *
 * <p>TODO: this tool should really lock the directory for writing before
 * removing any Lucene segment files, otherwise this tool itself may
 * corrupt the index.
 *
 * @author Otis Gospodnetic
 * @version $Id$
 */
public class SegmentPurger {

  // TODO: copied from SegmentMerger - should probably be made public
  // static final there, to make it reusable

  /** File extensions of old-style (multi-file) index files. */
  public static final String MULTIFILE_EXTENSIONS[] = new String[] {
    "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
  };

  /** File extensions of term vector files. */
  public static final String VECTOR_EXTENSIONS[] = new String[] {
    "tvx", "tvd", "tvf"
  };

  /** File extensions of compound index files. */
  public static final String COMPOUNDFILE_EXTENSIONS[] = new String[] {
    "cfs"
  };

  /**
   * File extensions of per-segment deleted-documents files. Without this
   * entry a <code>.del</code> file is misreported as a non-Lucene file.
   */
  public static final String DELETED_DOCS_EXTENSIONS[] = new String[] {
    "del"
  };

  /** Index-wide files that do not belong to a single segment. */
  public static final String INDEX_FILES[] = new String[] {
    "segments", "deletable"
  };

  /** All fixed per-segment file extensions recognised by this tool. */
  public static final String[][] SEGMENT_EXTENSIONS = new String[][] {
    MULTIFILE_EXTENSIONS, COMPOUNDFILE_EXTENSIONS, VECTOR_EXTENSIONS,
    DELETED_DOCS_EXTENSIONS
  };

  /**
   * The file format version, a negative number.
   * Works since counter, the old 1st entry, is always &gt;= 0.
   */
  public static final int FORMAT = -1;

  // Both fields are only filled in while parsing the segments file; they
  // are read from the file so that the stream stays positioned correctly.
  private int counter = 0;  // used to name new segments
  private long version = 0; // counts how often the index has been changed
                            // by adding or deleting docs

  private FSDirectory directory;

  /**
   * Creates a purger for the given index directory.
   *
   * @param directory the index directory to inspect
   */
  public SegmentPurger(FSDirectory directory) {
    this.directory = directory;
  }

  /**
   * Reports every file in the directory that is not recognised as a Lucene
   * index file.
   *
   * @throws IOException if the directory cannot be listed
   */
  public void purgeNonLuceneFiles() throws IOException {
    String[] indexDirFiles = listIndexDirectory();
    for (int i = 0; i < indexDirFiles.length; i++) {
      String fileName = indexDirFiles[i];
      if (!isLuceneDirectoryFile(fileName)) {
        System.out.println("Candidate non-Lucene file found: " + fileName);
        // Deletion intentionally disabled until the tool is better tested:
        // directory.deleteFile(fileName);
      }
    }
  }

  /**
   * Reports every Lucene file in the directory whose base name does not
   * match a segment listed in the <code>segments</code> file.
   *
   * @throws IOException if the directory cannot be listed or the
   *         <code>segments</code> file cannot be read
   */
  public void purgeUnusedLuceneFiles() throws IOException {
    String[] indexDirFiles = listIndexDirectory();
    // Read the segments file once, not once per directory entry.
    Set liveSegmentNames = getLiveSegmentNames();
    for (int i = 0; i < indexDirFiles.length; i++) {
      String fileName = indexDirFiles[i];
      if (isLuceneDirectoryFile(fileName)
          && !isUsed(fileName, liveSegmentNames)) {
        System.out.println("Candidate unused Lucene file found: " + fileName);
        // Deletion intentionally disabled until the tool is better tested:
        // directory.deleteFile(fileName);
      }
    }
  }

  /**
   * Lists the index directory, turning a <code>null</code> listing into an
   * <code>IOException</code> instead of a later NullPointerException.
   */
  private String[] listIndexDirectory() throws IOException {
    String[] files = directory.list();
    if (files == null) {
      // NOTE(review): presumably happens when the path is not a readable
      // directory - confirm against FSDirectory.list().
      throw new IOException("Cannot list index directory: " + directory);
    }
    return files;
  }

  /** Returns the names of all segments listed in the segments file. */
  private Set getLiveSegmentNames() throws IOException {
    Set names = new HashSet();
    for (Iterator it = getSegmentInfos().iterator(); it.hasNext();) {
      SegmentInfo si = (SegmentInfo) it.next();
      names.add(si.name);
    }
    return names;
  }

  /**
   * Parses the <code>segments</code> file and returns one
   * <code>SegmentInfo</code> per listed segment.
   *
   * @throws IOException if the file cannot be read or declares a format
   *         older than {@link #FORMAT}
   */
  private List getSegmentInfos() throws IOException {
    List siList = new ArrayList();
    IndexInput input = directory.openInput("segments");
    try {
      int format = input.readInt();
      if (format < 0) {     // file contains explicit format info
        // check that it is a format we can understand
        if (format < FORMAT) {
          throw new IOException("Unknown format version: " + format);
        }
        version = input.readLong(); // read version
        counter = input.readInt();  // read counter
      } else {              // file is in old format without explicit format info
        counter = format;
      }

      for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
        SegmentInfo si =
          new SegmentInfo(input.readString(), input.readInt(), directory);
        siList.add(si);
      }

      if (format >= 0) {    // in old format the version number may be at the end of the file
        if (input.getFilePointer() >= input.length()) {
          version = 0;      // old file format without version number
        } else {
          version = input.readLong(); // read version
        }
      }
    } finally {
      input.close();
    }
    return siList;
  }

  /**
   * Tells whether the file name looks like a Lucene index file: it ends in
   * one of the {@link #SEGMENT_EXTENSIONS}, is a <code>.fN</code> norms
   * file, or is one of the {@link #INDEX_FILES}.
   */
  private boolean isLuceneDirectoryFile(String fileName) {
    for (int i = 0; i < SEGMENT_EXTENSIONS.length; i++) {
      String[] extensions = SEGMENT_EXTENSIONS[i];
      for (int j = 0; j < extensions.length; j++) {
        if (fileName.endsWith("." + extensions[j])) {
          return true;
        }
      }
    }
    if (isNormsFile(fileName)) {
      return true;
    }
    return isIndexFile(fileName);
  }

  /**
   * Tells whether the file name ends in <code>.f</code> followed by one or
   * more decimal digits (e.g. <code>_a.f0</code>, <code>_a.f12</code>).
   */
  private static boolean isNormsFile(String fileName) {
    int dot = fileName.lastIndexOf('.');
    // need at least ".f" plus one digit after the dot
    if (dot < 0 || fileName.length() - dot < 3) {
      return false;
    }
    if (fileName.charAt(dot + 1) != 'f') {
      return false;
    }
    for (int i = dot + 2; i < fileName.length(); i++) {
      char c = fileName.charAt(i);
      if (c < '0' || c > '9') {
        return false;
      }
    }
    return true;
  }

  /** Tells whether the file name is exactly one of {@link #INDEX_FILES}. */
  private static boolean isIndexFile(String fileName) {
    for (int i = 0; i < INDEX_FILES.length; i++) {
      if (fileName.equals(INDEX_FILES[i])) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tells whether a Lucene file is still in use: either it is one of the
   * index-wide files, or the part of its name before the first dot matches
   * a live segment name.
   *
   * @param fileName         the file to check
   * @param liveSegmentNames names of the segments listed in the segments file
   */
  private boolean isUsed(String fileName, Set liveSegmentNames) {
    // these files are always used (e.g. segments, deletable)
    if (isIndexFile(fileName)) {
      return true;
    }

    // split file name into base and extension, because we compare the
    // file base name with the names of used segments
    String f = (new File(fileName)).getName();
    int dot = f.indexOf('.');
    if (dot < 0) {
      System.err.println("Can't split file name into base and extension: " + fileName);
      return false;
    }
    return liveSegmentNames.contains(f.substring(0, dot));
  }

  /**
   * Runs both checks against the index directory given as the first
   * argument and prints the candidates.
   *
   * @param args <code>args[0]</code> is the path of the index directory
   * @throws IOException if the directory or its segments file cannot be read
   */
  public static void main(String[] args) throws IOException {
    if (args == null || args.length < 1) {
      System.err.println("Usage: java " + SegmentPurger.class.getName()
                         + " <index directory>");
      System.exit(1);
    }
    FSDirectory directory = FSDirectory.getDirectory(args[0], false);
    try {
      SegmentPurger sp = new SegmentPurger(directory);
      sp.purgeNonLuceneFiles();
      sp.purgeUnusedLuceneFiles();
    } finally {
      directory.close();
    }
  }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]