package org.apache.lucene.index; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import java.io.IOException; import java.io.File; import java.io.PrintStream; import java.util.Vector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Lock; import org.apache.lucene.store.InputStream; import org.apache.lucene.store.OutputStream; import org.apache.lucene.document.Document; import org.apache.lucene.analysis.Analyzer; /** An IndexWriter creates and maintains an index. The third argument to the constructor determines whether a new index is created, or whether an existing index is opened for the addition of new documents. In either case, documents are added with the addDocument method. When finished adding documents, close should be called. If an index will not have more documents added for a while and optimal search performance is desired, then the optimize method should be called before the index is closed. 
*/ public final class IndexWriter { private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private final Directory ramDirectory = new RAMDirectory(); // for temp segs /** Constructs an IndexWriter for the index in path. Text will be analyzed with a. If create is true, then a new, empty index will be created in path, replacing the index already there, if any. */ public IndexWriter(String path, Analyzer a, boolean create) throws IOException { this(FSDirectory.getDirectory(path, create), a, create); } /** Constructs an IndexWriter for the index in path. Text will be analyzed with a. If create is true, then a new, empty index will be created in path, replacing the index already there, if any. */ public IndexWriter(File path, Analyzer a, boolean create) throws IOException { this(FSDirectory.getDirectory(path, create), a, create); } /** Constructs an IndexWriter for the index in d. Text will be analyzed with a. If create is true, then a new, empty index will be created in d, replacing the index already there, if any. */ public IndexWriter(Directory d, Analyzer a, final boolean create) throws IOException { directory = d; analyzer = a; Lock writeLock = directory.makeLock("write.lock"); if (!writeLock.obtain()) // obtain write lock throw new IOException("Index locked for write: " + writeLock); synchronized (directory) { // in- & inter-process sync new Lock.With(directory.makeLock("commit.lock")) { public Object doBody() throws IOException { if (create) segmentInfos.write(directory); else segmentInfos.read(directory); return null; } }.run(); } } /** Flushes all changes to an index, closes all associated files, and closes the directory that the index is stored in. 
*/ public final synchronized void close() throws IOException { flushRamSegments(); ramDirectory.close(); directory.makeLock("write.lock").release(); // release write lock directory.close(); } /** Returns the number of documents currently in this index. */ public final synchronized int docCount() { int count = 0; for (int i = 0; i < segmentInfos.size(); i++) { SegmentInfo si = segmentInfos.info(i); count += si.docCount; } return count; } /** The maximum number of terms that will be indexed for a single field in a document. This limits the amount of memory required for indexing, so that collections with very large files will not crash the indexing process by running out of memory.

By default, no more than 10,000 terms will be indexed for a field. */ public int maxFieldLength = 10000; /** Adds a document to this index.*/ public final void addDocument(Document doc) throws IOException { DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, maxFieldLength); String segmentName = newSegmentName(); dw.addDocument(segmentName, doc); synchronized (this) { segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory)); maybeMergeSegments(); } } private final synchronized String newSegmentName() { return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); } /** Determines how often segment indexes are merged by addDocument(). With * smaller values, less RAM is used while indexing, and searches on * unoptimized indexes are faster, but indexing speed is slower. With larger * values more RAM is used while indexing and searches on unoptimized indexes * are slower, but indexing is faster. Thus larger values (> 10) are best * for batched index creation, and smaller values (< 10) for indexes that are * interactively maintained. * *

This must never be less than 2. The default value is 10.*/ public int mergeFactor = 10; /** Determines the largest number of documents ever merged by addDocument(). * Small values (e.g., less than 10,000) are best for interactive indexing, * as this limits the length of pauses while indexing to a few seconds. * Larger values are best for batched indexing and speedier searches. * *

The default value is {@link Integer#MAX_VALUE}. */ public int maxMergeDocs = Integer.MAX_VALUE; /** If non-null, information about merges will be printed to this. */ public PrintStream infoStream = null; /** Merges all segments together into a single segment, optimizing an index for search. */ public final synchronized void optimize() throws IOException { flushRamSegments(); while (segmentInfos.size() > 1 || (segmentInfos.size() == 1 && (SegmentReader.hasDeletions(segmentInfos.info(0)) || segmentInfos.info(0).dir != directory))) { int minSegment = segmentInfos.size() - mergeFactor; mergeSegments(minSegment < 0 ? 0 : minSegment); } } /** Merges all segments from an array of indexes into this index. * *

This may be used to parallelize batch indexing. A large document * collection can be broken into sub-collections. Each sub-collection can be * indexed in parallel, on a different thread, process or machine. The * complete index can then be created by merging sub-collection indexes * with this method. * *

After this completes, the index is optimized. */ public final synchronized void addIndexes(Directory[] dirs) throws IOException { optimize(); // start with zero or 1 seg for (int i = 0; i < dirs.length; i++) { SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.read(dirs[i]); for (int j = 0; j < sis.size(); j++) { segmentInfos.addElement(sis.info(j)); // add each info } } optimize(); // final cleanup } /** Merges all RAM-resident segments. */ private final void flushRamSegments() throws IOException { int minSegment = segmentInfos.size()-1; int docCount = 0; while (minSegment >= 0 && (segmentInfos.info(minSegment)).dir == ramDirectory) { docCount += segmentInfos.info(minSegment).docCount; minSegment--; } if (minSegment < 0 || // add one FS segment? (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor || !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory)) minSegment++; if (minSegment >= segmentInfos.size()) return; // none to merge mergeSegments(minSegment); } /** Incremental segment merger. */ private final void maybeMergeSegments() throws IOException { long targetMergeDocs = mergeFactor; while (targetMergeDocs <= maxMergeDocs) { // find segments smaller than current target size int minSegment = segmentInfos.size(); int mergeDocs = 0; while (--minSegment >= 0) { SegmentInfo si = segmentInfos.info(minSegment); if (si.docCount >= targetMergeDocs) break; mergeDocs += si.docCount; } if (mergeDocs >= targetMergeDocs) // found a merge to do mergeSegments(minSegment+1); else break; targetMergeDocs *= mergeFactor; // increase target size } } /** Pops segments off of segmentInfos stack down to minSegment, merges them, and pushes the merged index onto the top of the segmentInfos stack. 
*/ private final void mergeSegments(int minSegment) throws IOException { String mergedName = newSegmentName(); int mergedDocCount = 0; if (infoStream != null) infoStream.print("merging segments"); SegmentMerger merger = new SegmentMerger(directory, mergedName); final Vector segmentsToDelete = new Vector(); for (int i = minSegment; i < segmentInfos.size(); i++) { SegmentInfo si = segmentInfos.info(i); if (infoStream != null) infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); SegmentReader reader = new SegmentReader(si); merger.add(reader); if ((reader.directory == this.directory) || // if we own the directory (reader.directory == this.ramDirectory)) segmentsToDelete.addElement(reader); // queue segment for deletion mergedDocCount += si.docCount; } if (infoStream != null) { infoStream.println(); infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)"); } merger.merge(); segmentInfos.setSize(minSegment); // pop old infos & add new segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount, directory)); synchronized (directory) { // in- & inter-process sync new Lock.With(directory.makeLock("commit.lock")) { public Object doBody() throws IOException { segmentInfos.write(directory); // commit before deleting deleteSegments(segmentsToDelete); // delete now-unused segments return null; } }.run(); } } /* Some operating systems (e.g. Windows) don't permit a file to be deleted while it is opened for read (e.g. by another process or thread). So we assume that when a delete fails it is because the file is open in another process, and queue the file for subsequent deletion. 
*/ private final void deleteSegments(Vector segments) throws IOException { Vector deletable = new Vector(); deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable for (int i = 0; i < segments.size(); i++) { SegmentReader reader = (SegmentReader)segments.elementAt(i); if (reader.directory == this.directory) deleteFiles(reader.files(), deletable); // try to delete our files else deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files } writeDeleteableFiles(deletable); // note files we can't delete } private final void deleteFiles(Vector files, Directory directory) throws IOException { for (int i = 0; i < files.size(); i++) directory.deleteFile((String)files.elementAt(i)); } private final void deleteFiles(Vector files, Vector deletable) throws IOException { for (int i = 0; i < files.size(); i++) { String file = (String)files.elementAt(i); try { directory.deleteFile(file); // try to delete each file } catch (IOException e) { // if delete fails if (directory.fileExists(file)) { if (infoStream != null) infoStream.println(e.getMessage() + "; Will re-try later."); deletable.addElement(file); // add to deletable } } } } private final Vector readDeleteableFiles() throws IOException { Vector result = new Vector(); if (!directory.fileExists("deletable")) return result; InputStream input = directory.openFile("deletable"); try { for (int i = input.readInt(); i > 0; i--) // read file names result.addElement(input.readString()); } finally { input.close(); } return result; } private final void writeDeleteableFiles(Vector files) throws IOException { OutputStream output = directory.createFile("deleteable.new"); try { output.writeInt(files.size()); for (int i = 0; i < files.size(); i++) output.writeString((String)files.elementAt(i)); } finally { output.close(); } directory.renameFile("deleteable.new", "deletable"); } }