Revision: 19559
http://sourceforge.net/p/gate/code/19559
Author: ian_roberts
Date: 2016-09-01 17:30:16 +0000 (Thu, 01 Sep 2016)
Log Message:
-----------
(Work in progress) utility to attempt to recover as much as possible from a
crashed index, when the zip collection was not closed properly and/or one or
more batches failed to sync to disk.
Added Paths:
-----------
mimir/trunk/mimir-core/src/gate/mimir/util/TruncateIndex.java
Added: mimir/trunk/mimir-core/src/gate/mimir/util/TruncateIndex.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/util/TruncateIndex.java
(rev 0)
+++ mimir/trunk/mimir-core/src/gate/mimir/util/TruncateIndex.java
2016-09-01 17:30:16 UTC (rev 19559)
@@ -0,0 +1,660 @@
+/*
+ * TruncateIndex.java
+ *
+ * Copyright (c) 2007-2016, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Ian Roberts, 1st September 2016
+ *
+ * $Id$
+ */
+package gate.mimir.util;
+
+import gate.mimir.index.AtomicIndex;
+import gate.mimir.index.DocumentCollection;
+import it.unimi.di.big.mg4j.index.CompressionFlags;
+import it.unimi.di.big.mg4j.index.DiskBasedIndex;
+import it.unimi.di.big.mg4j.index.Index;
+import it.unimi.di.big.mg4j.index.IndexIterator;
+import it.unimi.di.big.mg4j.index.IndexReader;
+import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
+import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
+import it.unimi.di.big.mg4j.io.IOFactory;
+import it.unimi.di.big.mg4j.tool.Scan;
+import it.unimi.dsi.big.io.FileLinesCollection;
+import it.unimi.dsi.bits.Fast;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongList;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+import it.unimi.dsi.lang.MutableString;
+import it.unimi.dsi.util.BloomFilter;
+import it.unimi.dsi.util.Properties;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.output.ByteArrayOutputStream;
+import org.apache.log4j.Logger;
+
+/**
+ * Utility class to fix up a Mimir index that has been corrupted, e.g.
+ * by an unclean shutdown or an out-of-memory condition. The index must
+ * be closed before using this tool, which means either that the Mimir
+ * webapp is not running, or that the index has been deleted from the
+ * running Mimir. It is very strongly recommended to back up an index
+ * before attempting this procedure. The clean-up process will
+ * unavoidably remove some number of documents from the tail of the
+ * index, but will attempt to keep the number of lost documents to a
+ * minimum.
+ *
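+ * <p>
+ * Typical programmatic use (the index path shown is illustrative):
+ * </p>
+ *
+ * <pre>
+ * TruncateIndex.truncateIndex(new File("/data/mimir/index-1"));
+ * </pre>
+ *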
+ * @author ian
+ *
+ */
+public class TruncateIndex {
+
+ private static final Logger log = Logger.getLogger(TruncateIndex.class);
+
+ /**
+ * Comparator that orders mimir zip collection files by number (e.g.
+ * mimir-collection-16.zip comes after mimir-collection-12-15.zip but
+ * before mimir-collection-100-120.zip)
+ */
+ public static final Comparator<File> ZIP_COLLECTION_COMPARATOR =
+ new Comparator<File>() {
+ public int compare(File a, File b) {
+ int numA =
+ Integer.parseInt(a.getName().substring(
+ a.getName().lastIndexOf('-') + 1,
+ a.getName().length() - 4));
+ int numB =
+ Integer.parseInt(b.getName().substring(
+ b.getName().lastIndexOf('-') + 1,
+ b.getName().length() - 4));
+ return numA - numB;
+ }
+ };
+
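+  /**
+   * Comparator that orders batch names: "head" sorts before any
+   * "tail-N" batch, and tails are ordered numerically (so tail-2
+   * comes before tail-10).
+   */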
+ public static final Comparator<String> BATCH_COMPARATOR =
+ new Comparator<String>() {
+ public int compare(String a, String b) {
+ if(a.equals("head")) {
+ if(b.equals("head")) {
+ // both heads
+ return 0;
+ } else {
+ // head before tail
+ return -1;
+ }
+ } else {
+ if(b.equals("head")) {
+ // tail after head
+ return 1;
+ } else {
+ // both tails, compare by number
+                int numA =
+                    Integer.parseInt(a.substring(a.lastIndexOf('-') + 1));
+                int numB =
+                    Integer.parseInt(b.substring(b.lastIndexOf('-') + 1));
+ return numA - numB;
+ }
+ }
+ }
+ };
+
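+  /**
+   * Filter matching the sub-index directories (token-N or mention-N)
+   * within the top-level index directory.
+   */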
+ public static final FilenameFilter INDEX_NAME_FILTER = new FilenameFilter() {
+ private Pattern pat = Pattern.compile("(?:token|mention)-\\d+");
+
+ @Override
+ public boolean accept(File dir, String name) {
+ return pat.matcher(name).matches();
+ }
+ };
+
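+  /**
+   * Filter matching batch directories ("head" or "tail-N") within a
+   * sub-index directory.
+   */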
+ public static final FilenameFilter BATCH_NAME_FILTER = new FilenameFilter() {
+ private Pattern pat = Pattern.compile("head|tail-\\d+");
+
+ @Override
+ public boolean accept(File dir, String name) {
+ return pat.matcher(name).matches();
+ }
+ };
+
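+  /**
+   * Command-line entry point. Takes a single argument: the path to
+   * the top-level index directory, e.g. (illustrative path)
+   * <code>java gate.mimir.util.TruncateIndex /data/mimir/index-1</code>
+   */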
+ public static void main(String... args) throws Exception {
+ truncateIndex(new File(args[0]));
+ }
+
+ /**
+ * Attempt to fix up a corrupted Mimir index by truncating some number
+ * of documents off the end. There will be a certain number of
+ * documents in complete index batches, and a (possibly different)
+ * number of documents successfully persisted to disk in the zip files
+   * of the DocumentCollection; the index will be truncated to the
+ * smaller of those two numbers.
+ *
+ * @param indexDirectory the top-level directory of the Mimir index
+ * (containing config.xml)
+ */
+ public static void truncateIndex(File indexDirectory) throws Exception {
+ // 1. Repair the last zip file in the DocumentCollection
+ repairLastZip(indexDirectory);
+
+ // 2. Determine the last "good" batch (the greatest numbered head or
+ // tail that is fully written to disk in every AtomicIndex) and
+ // stash the bad ones
+ String lastGoodBatch = determineLastGoodBatch(indexDirectory);
+
+ if(lastGoodBatch == null) {
+ throw new RuntimeException(
+ "All batches are corrupt, sorry, this index is a write-off");
+ }
+
+ // 3. If the zip collection is at least as long as the sum of the
+ // good batches, truncate it to match the batches and we're done.
+ BatchDetails batches = batchEndPoints(indexDirectory);
+ long totalDocsInBatches = batches.endPoints[batches.endPoints.length - 1];
+ long totalDocsInZips = totalDocumentsInZipCollection(indexDirectory);
+
+ if(totalDocsInBatches == totalDocsInZips) {
+ log.info("We're in luck, the batches and zips line up exactly");
+ return;
+ } else if(totalDocsInZips > totalDocsInBatches) {
+ truncateZipCollectionTo(indexDirectory, totalDocsInBatches);
+ return;
+ } else if(totalDocsInZips == 0) {
+ throw new RuntimeException("Zip collection is empty");
+ }
+
+ // 4. Otherwise, the zip collection stops in the middle of a batch B
+ int endBatch = -1;
+ for(int i = 0; i < batches.names.length; i++) {
+ if(batches.endPoints[i] >= totalDocsInZips) {
+ endBatch = i;
+ break;
+ }
+ }
+ log.info("Zip collection ends within " + batches.names[endBatch]);
+ if(batches.endPoints[endBatch] == totalDocsInZips) {
+ // special case - zip collection ends exactly at the end of a
+ // batch. Stash subsequent batches and we're done
+ log.info("Zip collection ends exactly at the end of batch "
+ + batches.names[endBatch]);
+ log.info("Stashing subsequent batches");
+      stashBatches(indexDirectory, java.util.Arrays.asList(batches.names)
+          .subList(endBatch + 1, batches.names.length));
+ log.info("Done");
+ return;
+ }
+ // 4.1. Stash B (for every AtomicIndex) and any batches beyond it.
+    stashBatches(indexDirectory, java.util.Arrays.asList(batches.names)
+        .subList(endBatch, batches.names.length));
+
+ // 4.2. Read each stashed B and re-write it but with documents
+ // beyond the end of the zip collection omitted
+ long endOfPreviousBatch = 0L;
+ if(endBatch > 0) {
+ endOfPreviousBatch = batches.endPoints[endBatch - 1];
+ }
+ trimBatch(indexDirectory, batches.names[endBatch], totalDocsInZips
+ - endOfPreviousBatch);
+
+ // 4.3. Rebuild the direct indexes for those AtomicIndexes that
+ // require it
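+    // (not yet implemented in this work-in-progress revision)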
+ }
+
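+  /**
+   * Repairs the last zip file of the document collection by copying
+   * its entries to a fresh zip, stopping at the first truncated entry.
+   * The original (possibly corrupt) file is stashed in the
+   * broken-batches directory.
+   */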
+ public static void repairLastZip(File indexDirectory) throws IOException {
+ log.info("Ensuring last zip file in " + indexDirectory.getAbsolutePath()
+ + " is complete");
+ File[] zipCollectionFiles =
+        indexDirectory
+            .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER);
+ if(zipCollectionFiles.length > 0) {
+ java.util.Arrays.sort(zipCollectionFiles, ZIP_COLLECTION_COMPARATOR);
+ File lastZip = zipCollectionFiles[zipCollectionFiles.length - 1];
+ log.info("Last zip is " + lastZip.getName());
+ File brokenBatches = new File(indexDirectory, "broken-batches");
+ brokenBatches.mkdirs();
+ File movedLastZip = new File(brokenBatches, lastZip.getName());
+ if(movedLastZip.exists()) {
+ movedLastZip.delete();
+ }
+ if(!lastZip.renameTo(movedLastZip)) {
+ throw new RuntimeException("Could not stash " + lastZip.getName()
+ + " in broken-batches");
+ }
+ log.debug("Moved " + lastZip.getName() + " to broken-batches");
+ String lastGoodDoc = null;
+ try(FileInputStream oldIn = new FileInputStream(movedLastZip);
+ ZipInputStream zipIn = new ZipInputStream(oldIn);
+ FileOutputStream newOut = new FileOutputStream(lastZip);
+ ZipOutputStream zipOut = new ZipOutputStream(newOut)) {
+ ZipEntry entry = null;
+ try {
+ while((entry = zipIn.getNextEntry()) != null) {
+ ByteArrayOutputStream data = new ByteArrayOutputStream();
+ IOUtils.copy(zipIn, data);
+ // if we get here the input zip was not truncated mid-entry,
+ // so it's safe to write this entry
+ zipOut.putNextEntry(entry);
+ IOUtils.write(data.toByteArray(), zipOut);
+ zipOut.closeEntry();
+ lastGoodDoc = entry.getName();
+ }
+ } catch(EOFException eof) {
+ // this is expected, if the zip was not properly closed
+ }
+ }
+ log.info("Last good document ID was " + lastGoodDoc);
+ } else {
+ log.warn("No files in zip collection");
+ }
+ }
+
+  /**
+   * Determines the last "good" batch name (head or tail-N) for the
+   * given index, and stashes any bad batches in the broken-batches
+   * directory.
+   *
+   * @param indexDirectory the top-level directory of the Mimir index
+   * @return the name of the last good batch, or <code>null</code> if
+   *           no complete batch could be found
+   * @throws IOException
+   */
+ public static String determineLastGoodBatch(File indexDirectory)
+ throws IOException {
+ String lastGood = null;
+
+ File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
+ if(subIndexes.length == 0) {
+ throw new RuntimeException("Index has no AtomicIndexes!");
+ }
+ String[] batches = subIndexes[0].list(BATCH_NAME_FILTER);
+ java.util.Arrays.sort(batches, BATCH_COMPARATOR);
+ BATCH: for(String batch : batches) {
+ for(File subIndex : subIndexes) {
+ if(!new File(new File(subIndex, batch), subIndex.getName()
+ + ".properties").exists()) {
+ break BATCH;
+ }
+ }
+ // if we get to here we know this batch exists in all sub-indexes
+ lastGood = batch;
+ }
+
+ if(lastGood != null) {
+ File brokenBatches = new File(indexDirectory, "broken-batches");
+ // stash bad batches
+ for(File subIndex : subIndexes) {
+ File[] thisIndexBatches = subIndex.listFiles(BATCH_NAME_FILTER);
+ for(File b : thisIndexBatches) {
+ if(BATCH_COMPARATOR.compare(lastGood, b.getName()) < 0) {
+ // this is a bad batch, stash it
+ File movedB =
+ new File(brokenBatches, subIndex.getName() + "-"
+ + b.getName());
+ if(movedB.exists()) {
+ FileUtils.deleteDirectory(movedB);
+ }
+ if(!b.renameTo(movedB)) {
+            throw new RuntimeException("Could not stash "
+                + movedB.getName());
+ }
+ }
+ }
+ }
+ }
+
+ return lastGood;
+ }
+
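+  /**
+   * Holder for the sorted batch names of an index and the cumulative
+   * number of documents up to the end of each batch.
+   */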
+ public static class BatchDetails {
+ String[] names;
+
+ long[] endPoints;
+ }
+
+ public static BatchDetails batchEndPoints(File indexDirectory)
+ throws IOException, ConfigurationException {
+ BatchDetails details = new BatchDetails();
+ long totalDocs = 0;
+ File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
+ if(subIndexes.length == 0) {
+ throw new RuntimeException("Index has no AtomicIndexes!");
+ }
+ details.names = subIndexes[0].list(BATCH_NAME_FILTER);
+ java.util.Arrays.sort(details.names, BATCH_COMPARATOR);
+
+ details.endPoints = new long[details.names.length];
+ for(int i = 0; i < details.names.length; i++) {
+ Properties batchProps = new Properties();
+ try(FileInputStream propsIn =
+ new FileInputStream(new File(new File(subIndexes[0],
+ details.names[i]), subIndexes[0].getName()
+ + ".properties"))) {
+ batchProps.load(propsIn);
+ }
+ totalDocs += batchProps.getLong("documents");
+ details.endPoints[i] = totalDocs;
+ }
+
+ return details;
+ }
+
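+  /**
+   * Counts the total number of documents stored across all the zip
+   * files of the DocumentCollection.
+   */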
+ public static long totalDocumentsInZipCollection(File indexDirectory)
+ throws IOException {
+ long totalDocs = 0;
+ File[] zipCollectionFiles =
+        indexDirectory
+            .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER);
+ for(File zip : zipCollectionFiles) {
+ try(ZipFile zf = new ZipFile(zip)) {
+ totalDocs += zf.size();
+ }
+ }
+
+ return totalDocs;
+ }
+
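+  /**
+   * Truncates the zip collection to exactly numDocs documents,
+   * rewriting the zip file that contains the cut point and stashing
+   * the original (and any files entirely beyond the cut) in
+   * broken-batches.
+   */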
+ public static void truncateZipCollectionTo(File indexDirectory, long numDocs)
+ throws IOException {
+ File[] zipCollectionFiles =
+        indexDirectory
+            .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER);
+ java.util.Arrays.sort(zipCollectionFiles, ZIP_COLLECTION_COMPARATOR);
+ // the truncation point is somewhere within the last zip file whose
+ // first entry is less than numDocs (document IDs are zero based, so
+ // the document named numDocs is actually the (numDocs+1)th one).
+ int targetFile = -1;
+ for(int i = 0; i < zipCollectionFiles.length; i++) {
+ try(FileInputStream fis = new FileInputStream(zipCollectionFiles[i]);
+ ZipInputStream zipIn = new ZipInputStream(fis)) {
+ ZipEntry firstEntry = zipIn.getNextEntry();
+ if(firstEntry != null) {
+ long documentId = Long.parseLong(firstEntry.getName());
+ if(documentId >= numDocs) {
+ break;
+ } else {
+ targetFile = i;
+ }
+ }
+ }
+ }
+
+ if(targetFile < 0) {
+ throw new RuntimeException(
+          "Zip collection broken beyond repair - there is no zip file"
+              + " containing the cut point");
+ }
+
+    // we know that document (numDocs-1) is somewhere in
+    // zipCollectionFiles[targetFile]. Move that file out of the way and
+    // rewrite it, truncated appropriately. Any later zip files lie
+    // entirely beyond the cut point, so stash them too rather than
+    // leaving orphaned documents in the collection.
+    File origFile = zipCollectionFiles[targetFile];
+    File brokenBatches = new File(indexDirectory, "broken-batches");
+    brokenBatches.mkdirs();
+    for(int i = targetFile + 1; i < zipCollectionFiles.length; i++) {
+      File extraZip = zipCollectionFiles[i];
+      File movedExtraZip = new File(brokenBatches, extraZip.getName());
+      if(movedExtraZip.exists()) {
+        movedExtraZip.delete();
+      }
+      if(!extraZip.renameTo(movedExtraZip)) {
+        throw new RuntimeException("Could not stash " + extraZip.getName()
+            + " in broken-batches");
+      }
+    }
+ File movedFile =
+ new File(brokenBatches, "to-truncate-" + origFile.getName());
+ if(movedFile.exists()) {
+ movedFile.delete();
+ }
+ if(!origFile.renameTo(movedFile)) {
+ throw new RuntimeException("Could not stash " + origFile.getName()
+ + " in broken-batches");
+ }
+ String lastEntryName = String.valueOf(numDocs - 1);
+ try(FileInputStream oldIn = new FileInputStream(movedFile);
+ ZipInputStream zipIn = new ZipInputStream(oldIn);
+ FileOutputStream newOut = new FileOutputStream(origFile);
+ ZipOutputStream zipOut = new ZipOutputStream(newOut)) {
+ ZipEntry entry = null;
+ try {
+ while((entry = zipIn.getNextEntry()) != null) {
+ ByteArrayOutputStream data = new ByteArrayOutputStream();
+ IOUtils.copy(zipIn, data);
+ // if we get here the input zip was not truncated mid-entry,
+ // so it's safe to write this entry
+ zipOut.putNextEntry(entry);
+ IOUtils.write(data.toByteArray(), zipOut);
+ zipOut.closeEntry();
+ if(lastEntryName.equals(entry.getName())) {
+ // reached the cut point, stop copying
+ break;
+ }
+ }
+ } catch(EOFException eof) {
+ // this is expected, if the zip was not properly closed
+ }
+ }
+ log.info("Truncated zip collection file " + origFile + " to document "
+ + lastEntryName);
+ }
+
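+  /**
+   * Moves the given batches of every sub-index into the broken-batches
+   * directory, naming each stashed copy subIndexName-batchName.
+   */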
+ public static void stashBatches(File indexDirectory, List<String> batches)
+ throws IOException {
+ File brokenBatches = new File(indexDirectory, "broken-batches");
+ File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
+
+ for(File subIndex : subIndexes) {
+ for(String batchName : batches) {
+ File b = new File(subIndex, batchName);
+ File movedB =
+ new File(brokenBatches, subIndex.getName() + "-" + batchName);
+ if(movedB.exists()) {
+ FileUtils.deleteDirectory(movedB);
+ }
+ if(!b.renameTo(movedB)) {
+ throw new RuntimeException("Could not stash " + movedB.getName());
+ }
+ }
+ }
+ }
+
+ /**
+ * Trim the given batch in all sub-indexes to the given length in
+ * documents. Assumes the batch has already been stashed as
+ * broken-batches/subindex-batchName.
+ *
+ * @param indexDirectory top level index directory
+ * @param batchName name of the batch to trim
+ * @param numDocs number of documents to which the batch should be
+ * trimmed.
+ */
+ public static void trimBatch(File indexDirectory, String batchName,
+ long numDocs) throws Exception {
+ File brokenBatches = new File(indexDirectory, "broken-batches");
+ File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
+
+ for(File subIndex : subIndexes) {
+ File stashedBatch =
+ new File(brokenBatches, subIndex.getName() + "-" + batchName);
+ if(!stashedBatch.exists()) {
+ throw new RuntimeException("Stashed batch " + stashedBatch
+ + " not found");
+ }
+ File batchDir = new File(subIndex, batchName);
+ batchDir.mkdirs();
+ log.info("Trimming batch " + batchDir);
+ String stashedIndexBasename =
+ new File(stashedBatch, subIndex.getName()).getAbsolutePath();
+ String outputIndexBasename =
+ new File(batchDir, subIndex.getName()).getAbsolutePath();
+
+ Index stashedIndex = Index.getInstance(stashedIndexBasename, true, true);
+
+ // when you read through an index sequentially, the IndexIterators
+ // don't tell you what term they were for, so we need to read the
+ // .terms file from the stashed batch in step with the index
+ // reader.
+ File stashedTermsFile =
+ new File(stashedIndexBasename + DiskBasedIndex.TERMS_EXTENSION);
+ FileLinesCollection termsColl =
+ new FileLinesCollection(stashedTermsFile.getAbsolutePath(),
+ "UTF-8");
+ long numTerms = termsColl.size64();
+ Iterator<MutableString> termsIter = termsColl.iterator();
+ File newTermsFile =
+ new File(outputIndexBasename + DiskBasedIndex.TERMS_EXTENSION);
+
+      // there will certainly be no *more* than numTerms terms in the
+      // final index, though there may be fewer
+ BloomFilter<Void> termFilter = BloomFilter.create(Math.max(numTerms, 1));
+
+ Properties writerProperties = null;
+ long writtenBits = 0;
+ int maxDocSize = 0;
+ int maxCount = 0;
+ long totalOccurrences = 0;
+ try(IndexReader indexReader = stashedIndex.getReader();
+ FileOutputStream termsOS = new FileOutputStream(newTermsFile);
+ OutputStreamWriter termsOSW =
+ new OutputStreamWriter(termsOS, "UTF-8");
+ PrintWriter termsWriter = new PrintWriter(termsOSW)) {
+ QuasiSuccinctIndexWriter indexWriter =
+ new QuasiSuccinctIndexWriter(
+ IOFactory.FILESYSTEM_FACTORY,
+ outputIndexBasename,
+ numDocs,
+              Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
+ QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
+ CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
+ ByteOrder.nativeOrder());
+
+ IndexIterator iter;
+ while((iter = indexReader.nextIterator()) != null) {
+ MutableString term = termsIter.next();
+ // we can't stream the inverted list, because we need to know
+ // up front how many documents the term is found in so we can
+ // write that number before writing the positions.
+ LongList docPointers = new LongArrayList();
+ IntList counts = new IntArrayList();
+ List<IntArrayList> positions = new ArrayList<>();
+ long frequency = 0;
+ long curPointer;
+ long occurrences = 0;
+ long sumMaxPos = 0;
+          while((curPointer = iter.nextDocument()) != IndexIterator.END_OF_LIST) {
+ if(curPointer < numDocs) {
+ frequency++;
+ docPointers.add(curPointer);
+ counts.add(iter.count());
+              IntArrayList thisDocPositions = new IntArrayList(iter.count());
+              // record this document's position list so it can be
+              // written out in the loop below (positions.get(i))
+              positions.add(thisDocPositions);
+ occurrences += iter.count();
+ totalOccurrences += iter.count();
+ if(iter.count() > maxCount) {
+ maxCount = iter.count();
+ }
+ int pos;
+ int lastPos = 0;
+            while((pos = iter.nextPosition()) != IndexIterator.END_OF_POSITIONS) {
+ thisDocPositions.add(pos);
+ lastPos = pos;
+ }
+ sumMaxPos += lastPos;
+ if(lastPos > maxDocSize) {
+ maxDocSize = lastPos;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if(frequency > 0) {
+ // this term occurred in at least one document that we're
+ // not truncating, so now we know it's safe to write the
+ // (truncated) inverted list to the new index and the term
+ // to the terms file.
+
+ term.println(termsWriter);
+ termFilter.add(term);
+
+ indexWriter.newInvertedList(frequency, occurrences, sumMaxPos);
+ indexWriter.writeFrequency(frequency);
+ for(int i = 0; i < frequency; i++) {
+ OutputBitStream obs = indexWriter.newDocumentRecord();
+ indexWriter.writeDocumentPointer(obs, docPointers.get(i));
+ indexWriter.writePositionCount(obs, counts.get(i));
+ indexWriter.writeDocumentPositions(obs, positions.get(i)
+ .elements(), 0, positions.get(i).size(), -1);
+ }
+ }
+ }
+
+ indexWriter.close();
+ writerProperties = indexWriter.properties();
+ // write stats file
+ try(PrintStream statsPs =
+ new PrintStream(new File(outputIndexBasename
+ + DiskBasedIndex.STATS_EXTENSION))) {
+ indexWriter.printStats(statsPs);
+ }
+ writtenBits = indexWriter.writtenBits();
+ }
+
+ // regenerate the term map from the (possibly shorter) terms file
+ AtomicIndex.generateTermMap(new File(outputIndexBasename
+ + DiskBasedIndex.TERMS_EXTENSION), new File(outputIndexBasename
+ + DiskBasedIndex.TERMMAP_EXTENSION), null);
+
+ // write the bloom filter
+ BinIO.storeObject(termFilter, new File(outputIndexBasename
+ + DocumentalCluster.BLOOM_EXTENSION));
+
+ // write the truncated sizes file
+ File stashedSizesFile =
+ new File(stashedIndexBasename + DiskBasedIndex.SIZES_EXTENSION);
+ InputBitStream stashedSizesStream = new InputBitStream(stashedSizesFile);
+ File sizesFile =
+ new File(outputIndexBasename + DiskBasedIndex.SIZES_EXTENSION);
+ OutputBitStream sizesStream = new OutputBitStream(sizesFile);
+ for(long i = 0; i < numDocs; i++) {
+ sizesStream.writeGamma(stashedSizesStream.readGamma());
+ }
+      sizesStream.close();
+      stashedSizesStream.close();
+
+ // generate the index properties
+ Properties stashedProps = new Properties();
+ try(FileInputStream stashedPropsStream =
+ new FileInputStream(stashedIndexBasename
+ + DiskBasedIndex.PROPERTIES_EXTENSION)) {
+ stashedProps.load(stashedPropsStream);
+ }
+ Properties newProps = new Properties();
+ newProps.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ stashedProps.getProperty(Index.PropertyKeys.TERMPROCESSOR));
+ newProps.setProperty(Index.PropertyKeys.SIZE, writtenBits);
+      newProps.setProperty(Index.PropertyKeys.MAXDOCSIZE, maxDocSize);
+ newProps.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
+ newProps.setProperty(Index.PropertyKeys.OCCURRENCES,
+ totalOccurrences);
+ writerProperties.addAll(newProps);
+ Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, writerProperties,
+ outputIndexBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
+ }
+ }
+}
Property changes on: mimir/trunk/mimir-core/src/gate/mimir/util/TruncateIndex.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property