This is an automated email from the ASF dual-hosted git repository.
shauryachats pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 85e10a02f42 Fix AIOOB in Lucene text-index reuse path on retried
segment conversion (#18885)
85e10a02f42 is described below
commit 85e10a02f42c1f247fe81156c6af9c570e0d3d17
Author: Anurag Rai <[email protected]>
AuthorDate: Tue Jun 30 21:21:36 2026 +0530
Fix AIOOB in Lucene text-index reuse path on retried segment conversion
(#18885)
When the realtime -> immutable conversion path runs with reuseMutableIndex,
convertMutableSegment copies the mutable Lucene index to the v1 destination
and opens the writer with CREATE_OR_APPEND. If a prior conversion attempt
crashed or was killed mid-merge, the destination can hold leftover Lucene
segments that FileUtils.copyDirectory preserves. CREATE_OR_APPEND then opens
the highest segments_N file - which may reference the stale segments - and
the resulting Lucene index ends up with a different document count than the
surrounding Pinot segment. At query time DocIdTranslator's mapping buffer
(sized by the segment's numDocs) throws ArrayIndexOutOfBoundsException for
orphan Lucene docIDs.
Clean the destination directory before copying in both
LuceneTextIndexCreator
and MultiColumnLuceneTextIndexCreator. Add regression tests that prime the
destination with a force-merged stale index (its bumped segments_N counter
deterministically survives the copy) and assert both creators wipe and
rebuild
correctly.
---
.../creator/impl/text/LuceneTextIndexCreator.java | 7 +
.../text/MultiColumnLuceneTextIndexCreator.java | 7 +
.../MultiColumnLuceneTextIndexCreatorTest.java | 150 +++++++++++++++++++++
.../index/creator/LuceneTextIndexCreatorTest.java | 125 +++++++++++++++++
4 files changed, 289 insertions(+)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index 72431dcc963..292b860b778 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -226,6 +226,13 @@ public class LuceneTextIndexCreator extends
AbstractTextIndexCreator {
// Copy the mutable index to the v1 index location
File dest = getV1TextIndexFile(segmentIndexDir);
File mutableDir = getMutableIndexDir(segmentIndexDir, consumerDir);
+ // Remove any stale Lucene files left over at the destination from a
prior crashed or killed
+ // conversion attempt. Otherwise FileUtils.copyDirectory preserves
dest-only files, and
+ // CREATE_OR_APPEND below folds those stale segments into the merged
index, inflating
+ // Lucene numDocs past the segment's totalDocs and causing AIOOB in
DocIdTranslator at query time.
+ if (dest.exists()) {
+ FileUtils.deleteDirectory(dest);
+ }
FileUtils.copyDirectory(mutableDir, dest);
// Remove the copied write.lock file
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
index 387805960bd..9193882c2e8 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
@@ -214,6 +214,13 @@ public class MultiColumnLuceneTextIndexCreator implements
Closeable {
// Copy the mutable index to the v1 index location
File dest = getV1TextIndexFile(segmentIndexDir);
File mutableDir = getMutableIndexDir(segmentIndexDir, consumerDir);
+ // Remove any stale Lucene files left over at the destination from a
prior crashed or killed
+ // conversion attempt. Otherwise FileUtils.copyDirectory preserves
dest-only files, and
+ // CREATE_OR_APPEND below folds those stale segments into the merged
index, inflating
+ // Lucene numDocs past the segment's totalDocs and causing AIOOB in
DocIdTranslator at query time.
+ if (dest.exists()) {
+ FileUtils.deleteDirectory(dest);
+ }
FileUtils.copyDirectory(mutableDir, dest);
// Remove the copied write.lock file
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
new file mode 100644
index 00000000000..2292bc11029
--- /dev/null
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.creator.impl.text;
+
+import it.unimi.dsi.fastutil.booleans.BooleanArrayList;
+import it.unimi.dsi.fastutil.booleans.BooleanList;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.Map;
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.pinot.segment.spi.V1Constants;
+import
org.apache.pinot.segment.spi.index.multicolumntext.MultiColumnTextIndexConstants;
+import org.apache.pinot.spi.config.table.FieldConfig;
+import org.apache.pinot.spi.config.table.MultiColumnTextIndexConfig;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+public class MultiColumnLuceneTextIndexCreatorTest {
+
+ @Test
+ public void testConvertMutableSegmentCleansStaleDestFiles()
+ throws Exception {
+ String column = "foo";
+ String segmentName = "regSegMc";
+ int numMutableDocs = 3;
+ int numStaleDocs = 10;
+
+ File scratch = Files.createTempDirectory(
+ "MultiColumnLuceneTextIndexCreatorTest_stale_dest_").toFile();
+ try {
+ // 1. Build the mutable multi-column Lucene index at the location
+ File consumerDir = new File(scratch, "consumerDir");
+ File mutableIndexDir = new File(new File(consumerDir, segmentName),
+ MultiColumnTextIndexConstants.INDEX_DIR_NAME
+ + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+ FileUtils.forceMkdir(mutableIndexDir);
+ writeLuceneDocs(mutableIndexDir, column, /* startDocId */ 0,
numMutableDocs, /* multiSegment */ false);
+
+ // 2. segmentIndexDir's parent must follow the "tmp-<segmentName>-<ts>"
pattern
+ File tmpSegmentParent = new File(scratch, "tmp-" + segmentName + "-1");
+ File segmentIndexDir = new File(tmpSegmentParent, segmentName);
+ FileUtils.forceMkdir(segmentIndexDir);
+
+ // 3. Pre-populate the v1 destination with a multi-segment force-merged
stale Lucene index
+ File destIndexDir = new File(segmentIndexDir,
+ MultiColumnTextIndexConstants.INDEX_DIR_NAME
+ + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+ FileUtils.forceMkdir(destIndexDir);
+ writeLuceneDocs(destIndexDir, column, /* startDocId */ 100,
numStaleDocs, /* multiSegment */ true);
+ File staleSentinel = new File(destIndexDir,
"stale_from_prior_attempt.bin");
+ FileUtils.writeByteArrayToFile(staleSentinel, new byte[]{0x01, 0x02,
0x03});
+
+ // 4. Drive the conversion path: commit=true && realtimeConversion=true
with reuseMutableIndex
+ // in the config triggers convertMutableSegment.
+ MultiColumnTextIndexConfig mcConfig = new MultiColumnTextIndexConfig(
+ List.of(column),
+ Map.of(FieldConfig.TEXT_INDEX_LUCENE_REUSE_MUTABLE_INDEX, "true"),
+ null);
+ BooleanList columnsSV = new BooleanArrayList(new boolean[]{true});
+ try (MultiColumnLuceneTextIndexCreator creator = new
MultiColumnLuceneTextIndexCreator(
+ List.of(column), columnsSV, segmentIndexDir, /* commit */ true,
+ /* realtimeConversion */ true, consumerDir, /*
immutableToMutableIdMap */ null,
+ mcConfig)) {
+ // Constructor performs the conversion; no add()/seal() on the reuse
path.
+ }
+
+ // 5a. Dest wipe: sentinel must be gone.
+ Assert.assertFalse(staleSentinel.exists(),
+ "Stale file at v1 destination must be removed before conversion to "
+ + "avoid CREATE_OR_APPEND picking up leftovers from a prior
attempt");
+
+ // 5b. Lucene's view of the destination must match the mutable input's
doc count, not the
+ // stale leftover's.
+ try (Directory destDir = FSDirectory.open(destIndexDir.toPath());
+ DirectoryReader reader = DirectoryReader.open(destDir)) {
+ Assert.assertEquals(reader.numDocs(), numMutableDocs,
+ "Converted Lucene index must contain exactly the mutable input's
document count, not "
+ + "stale leftovers from a prior attempt");
+ }
+
+ // 5c. DocId mapping file (shared multi-column form) must be sized for
the segment's numDocs.
+ File mappingFile = new File(segmentIndexDir,
MultiColumnTextIndexConstants.DOCID_MAPPING_FILE_NAME);
+ Assert.assertTrue(mappingFile.exists(),
+ "Shared multi-column docId mapping file should be built during
conversion");
+ Assert.assertEquals(mappingFile.length(), (long) Integer.BYTES *
numMutableDocs,
+ "DocId mapping file must be sized for the segment's numDocs only");
+ } finally {
+ FileUtils.deleteDirectory(scratch);
+ }
+ }
+
+ private static void writeLuceneDocs(File indexDir, String column, int
startDocId, int count,
+ boolean multiSegment)
+ throws IOException {
+ try (Directory directory = FSDirectory.open(indexDir.toPath());
+ IndexWriter writer = new IndexWriter(directory,
+ new IndexWriterConfig(new StandardAnalyzer())
+ .setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
+ int splitAt = multiSegment ? Math.max(1, count / 2) : count;
+ for (int i = 0; i < splitAt; i++) {
+ writer.addDocument(buildDoc(column, startDocId + i));
+ }
+ if (multiSegment) {
+ writer.commit();
+ for (int i = splitAt; i < count; i++) {
+ writer.addDocument(buildDoc(column, startDocId + i));
+ }
+ writer.commit();
+ writer.forceMerge(1, true);
+ }
+ writer.commit();
+ }
+ }
+
+ private static Document buildDoc(String column, int docId) {
+ Document doc = new Document();
+ doc.add(new TextField(column, "doc " + docId, Field.Store.NO));
+ doc.add(new
StoredField(MultiColumnLuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME,
docId));
+ return doc;
+ }
+}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
index 83b125b2951..b4444f3d959 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
@@ -20,11 +20,23 @@ package
org.apache.pinot.segment.local.segment.index.creator;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
import
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import
org.apache.pinot.segment.local.segment.index.readers.text.LuceneTextIndexReader;
import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
+import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
@@ -104,4 +116,117 @@ public class LuceneTextIndexCreatorTest {
Assert.assertEquals(matchedDocIds[1], 2);
}
}
+
+ /**
+ * Regression test for the AIOOB triggered when a prior segment-conversion
attempt left stale
+ * Lucene files at the v1 destination directory and the reuse-mutable-index
path is taken.
+ */
+ @Test
+ public void testConvertMutableSegmentCleansStaleDestFiles()
+ throws Exception {
+ String column = "foo";
+ String segmentName = "regSeg";
+ int numMutableDocs = 3;
+ int numStaleDocs = 10;
+
+ File scratch = Files.createTempDirectory(
+ "LuceneTextIndexCreatorTest_stale_dest_").toFile();
+ try {
+ // 1. Build the mutable Lucene index
+ File consumerDir = new File(scratch, "consumerDir");
+ File mutableIndexDir = new File(new File(consumerDir, segmentName),
+ column + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+ FileUtils.forceMkdir(mutableIndexDir);
+ writeLuceneDocs(mutableIndexDir, /* startDocId */ 0, numMutableDocs, /*
multiSegment */ false);
+
+ // 2. segmentIndexDir's parent
+ File tmpSegmentParent = new File(scratch, "tmp-" + segmentName + "-1");
+ File segmentIndexDir = new File(tmpSegmentParent, segmentName);
+ FileUtils.forceMkdir(segmentIndexDir);
+
+ // 3. Pre-populate the v1 destination with stale Lucene files that
simulate a prior crashed
+ // or killed conversion
+ File destIndexDir = new File(segmentIndexDir,
+ column + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+ FileUtils.forceMkdir(destIndexDir);
+ writeLuceneDocs(destIndexDir, /* startDocId */ 100, numStaleDocs, /*
multiSegment */ true);
+
+ File staleSentinel = new File(destIndexDir,
"stale_from_prior_attempt.bin");
+ FileUtils.writeByteArrayToFile(staleSentinel, new byte[]{0x01, 0x02,
0x03});
+
+ // 4. commit=true && realtimeConversion=true && reuseMutableIndex=true
triggers
+ // convertMutableSegment.
+ TextIndexConfig config = new
TextIndexConfigBuilder().withReuseMutableIndex(true).build();
+ try (LuceneTextIndexCreator creator = new LuceneTextIndexCreator(column,
segmentIndexDir,
+ /* commit */ true, /* realtimeConversion */ true, consumerDir,
+ /* mutableSegmentCompacted */ false, /* immutableToMutableIdMap */
null,
+ /* combineAndCleanupFiles */ false, config)) {
+ // Constructor performs the conversion; no add()/seal() on the reuse
path.
+ }
+
+ // 5a. Dest wipe: sentinel must be gone.
+ Assert.assertFalse(staleSentinel.exists(),
+ "Stale file at v1 destination must be removed before conversion to "
+ + "avoid CREATE_OR_APPEND picking up leftovers from a prior
attempt");
+
+ // 5b. Lucene's view of the destination must match the mutable input's
doc count, not the
+ // stale leftover's. Pre-fix, Lucene opens stale's higher-N segments
file and reports
+ // numStaleDocs instead.
+ try (Directory destDir = FSDirectory.open(destIndexDir.toPath());
+ DirectoryReader reader = DirectoryReader.open(destDir)) {
+ Assert.assertEquals(reader.numDocs(), numMutableDocs,
+ "Converted Lucene index must contain exactly the mutable input's
document count, not "
+ + "stale leftovers from a prior attempt");
+ }
+
+ // 5c. DocId mapping file is sized by Lucene's numDocs at write time and
remapped by the
+ // segment's numDocs at read time - a mismatch here is what produced
AIOOB in production.
+ File mappingFile = new File(segmentIndexDir,
+ column +
V1Constants.Indexes.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+ Assert.assertTrue(mappingFile.exists(), "DocId mapping file should be
built during conversion");
+ Assert.assertEquals(mappingFile.length(), (long) Integer.BYTES *
numMutableDocs,
+ "DocId mapping file must be sized for the segment's numDocs only");
+
+ // 5d. End-to-end read
+ try (LuceneTextIndexReader reader = new LuceneTextIndexReader(column,
segmentIndexDir,
+ numMutableDocs, config)) {
+ int[] matched = reader.getDocIds("doc").toArray();
+ for (int docId : matched) {
+ Assert.assertTrue(docId >= 0 && docId < numMutableDocs,
+ "Returned Pinot docId " + docId + " must be within [0, " +
numMutableDocs + ")");
+ }
+ }
+ } finally {
+ FileUtils.deleteDirectory(scratch);
+ }
+ }
+
+ private static void writeLuceneDocs(File indexDir, int startDocId, int
count, boolean multiSegment)
+ throws IOException {
+ try (Directory directory = FSDirectory.open(indexDir.toPath());
+ IndexWriter writer = new IndexWriter(directory,
+ new IndexWriterConfig(new StandardAnalyzer())
+ .setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
+ int splitAt = multiSegment ? Math.max(1, count / 2) : count;
+ for (int i = 0; i < splitAt; i++) {
+ writer.addDocument(buildDoc(startDocId + i));
+ }
+ if (multiSegment) {
+ writer.commit();
+ for (int i = splitAt; i < count; i++) {
+ writer.addDocument(buildDoc(startDocId + i));
+ }
+ writer.commit();
+ writer.forceMerge(1, true);
+ }
+ writer.commit();
+ }
+ }
+
+ private static Document buildDoc(int docId) {
+ Document doc = new Document();
+ doc.add(new TextField("foo", "doc " + docId, Field.Store.NO));
+ doc.add(new
StoredField(LuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME, docId));
+ return doc;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]