This is an automated email from the ASF dual-hosted git repository.

shauryachats pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 85e10a02f42 Fix AIOOB in Lucene text-index reuse path on retried 
segment conversion (#18885)
85e10a02f42 is described below

commit 85e10a02f42c1f247fe81156c6af9c570e0d3d17
Author: Anurag Rai <[email protected]>
AuthorDate: Tue Jun 30 21:21:36 2026 +0530

    Fix AIOOB in Lucene text-index reuse path on retried segment conversion 
(#18885)
    
    When the realtime -> immutable conversion path runs with reuseMutableIndex,
    convertMutableSegment copies the mutable Lucene index to the v1 destination
    and opens the writer with CREATE_OR_APPEND. If a prior conversion attempt
    crashed or was killed mid-merge, the destination can hold leftover Lucene
    segments that FileUtils.copyDirectory preserves. CREATE_OR_APPEND then opens
    the highest segments_N file - which may reference the stale segments - and
    the resulting Lucene index ends up with a different document count than the
    surrounding Pinot segment. At query time DocIdTranslator's mapping buffer
    (sized by the segment's numDocs) throws ArrayIndexOutOfBoundsException for
    orphan Lucene docIDs.
    
    Clean the destination directory before copying in both 
LuceneTextIndexCreator
    and MultiColumnLuceneTextIndexCreator. Add regression tests that prime the
    destination with a force-merged stale index (its bumped segments_N counter
    deterministically survives the copy) and assert both creators wipe and 
rebuild
    correctly.
---
 .../creator/impl/text/LuceneTextIndexCreator.java  |   7 +
 .../text/MultiColumnLuceneTextIndexCreator.java    |   7 +
 .../MultiColumnLuceneTextIndexCreatorTest.java     | 150 +++++++++++++++++++++
 .../index/creator/LuceneTextIndexCreatorTest.java  | 125 +++++++++++++++++
 4 files changed, 289 insertions(+)

diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index 72431dcc963..292b860b778 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -226,6 +226,13 @@ public class LuceneTextIndexCreator extends 
AbstractTextIndexCreator {
       // Copy the mutable index to the v1 index location
       File dest = getV1TextIndexFile(segmentIndexDir);
       File mutableDir = getMutableIndexDir(segmentIndexDir, consumerDir);
+      // Remove any stale Lucene files left over at the destination from a 
prior crashed or killed
+      // conversion attempt. Otherwise FileUtils.copyDirectory preserves 
dest-only files, and
+      // CREATE_OR_APPEND below folds those stale segments into the merged 
index, inflating
+      // Lucene numDocs past the segment's totalDocs and causing AIOOB in 
DocIdTranslator at query time.
+      if (dest.exists()) {
+        FileUtils.deleteDirectory(dest);
+      }
       FileUtils.copyDirectory(mutableDir, dest);
 
       // Remove the copied write.lock file
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
index 387805960bd..9193882c2e8 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreator.java
@@ -214,6 +214,13 @@ public class MultiColumnLuceneTextIndexCreator implements 
Closeable {
       // Copy the mutable index to the v1 index location
       File dest = getV1TextIndexFile(segmentIndexDir);
       File mutableDir = getMutableIndexDir(segmentIndexDir, consumerDir);
+      // Remove any stale Lucene files left over at the destination from a 
prior crashed or killed
+      // conversion attempt. Otherwise FileUtils.copyDirectory preserves 
dest-only files, and
+      // CREATE_OR_APPEND below folds those stale segments into the merged 
index, inflating
+      // Lucene numDocs past the segment's totalDocs and causing AIOOB in 
DocIdTranslator at query time.
+      if (dest.exists()) {
+        FileUtils.deleteDirectory(dest);
+      }
       FileUtils.copyDirectory(mutableDir, dest);
 
       // Remove the copied write.lock file
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
new file mode 100644
index 00000000000..2292bc11029
--- /dev/null
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/text/MultiColumnLuceneTextIndexCreatorTest.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.creator.impl.text;
+
+import it.unimi.dsi.fastutil.booleans.BooleanArrayList;
+import it.unimi.dsi.fastutil.booleans.BooleanList;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.Map;
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.pinot.segment.spi.V1Constants;
+import 
org.apache.pinot.segment.spi.index.multicolumntext.MultiColumnTextIndexConstants;
+import org.apache.pinot.spi.config.table.FieldConfig;
+import org.apache.pinot.spi.config.table.MultiColumnTextIndexConfig;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+public class MultiColumnLuceneTextIndexCreatorTest {
+
+  @Test
+  public void testConvertMutableSegmentCleansStaleDestFiles()
+      throws Exception {
+    String column = "foo";
+    String segmentName = "regSegMc";
+    int numMutableDocs = 3;
+    int numStaleDocs = 10;
+
+    File scratch = Files.createTempDirectory(
+        "MultiColumnLuceneTextIndexCreatorTest_stale_dest_").toFile();
+    try {
+      // 1. Build the mutable multi-column Lucene index at the location
+      File consumerDir = new File(scratch, "consumerDir");
+      File mutableIndexDir = new File(new File(consumerDir, segmentName),
+          MultiColumnTextIndexConstants.INDEX_DIR_NAME
+              + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+      FileUtils.forceMkdir(mutableIndexDir);
+      writeLuceneDocs(mutableIndexDir, column, /* startDocId */ 0, 
numMutableDocs, /* multiSegment */ false);
+
+      // 2. segmentIndexDir's parent must follow the "tmp-<segmentName>-<ts>" 
pattern
+      File tmpSegmentParent = new File(scratch, "tmp-" + segmentName + "-1");
+      File segmentIndexDir = new File(tmpSegmentParent, segmentName);
+      FileUtils.forceMkdir(segmentIndexDir);
+
+      // 3. Pre-populate the v1 destination with a multi-segment force-merged 
stale Lucene index
+      File destIndexDir = new File(segmentIndexDir,
+          MultiColumnTextIndexConstants.INDEX_DIR_NAME
+              + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+      FileUtils.forceMkdir(destIndexDir);
+      writeLuceneDocs(destIndexDir, column, /* startDocId */ 100, 
numStaleDocs, /* multiSegment */ true);
+      File staleSentinel = new File(destIndexDir, 
"stale_from_prior_attempt.bin");
+      FileUtils.writeByteArrayToFile(staleSentinel, new byte[]{0x01, 0x02, 
0x03});
+
+      // 4. Drive the conversion path: commit=true && realtimeConversion=true 
with reuseMutableIndex
+      //    in the config triggers convertMutableSegment.
+      MultiColumnTextIndexConfig mcConfig = new MultiColumnTextIndexConfig(
+          List.of(column),
+          Map.of(FieldConfig.TEXT_INDEX_LUCENE_REUSE_MUTABLE_INDEX, "true"),
+          null);
+      BooleanList columnsSV = new BooleanArrayList(new boolean[]{true});
+      try (MultiColumnLuceneTextIndexCreator creator = new 
MultiColumnLuceneTextIndexCreator(
+          List.of(column), columnsSV, segmentIndexDir, /* commit */ true,
+          /* realtimeConversion */ true, consumerDir, /* 
immutableToMutableIdMap */ null,
+          mcConfig)) {
+        // Constructor performs the conversion; no add()/seal() on the reuse 
path.
+      }
+
+      // 5a. Dest wipe: sentinel must be gone.
+      Assert.assertFalse(staleSentinel.exists(),
+          "Stale file at v1 destination must be removed before conversion to "
+              + "avoid CREATE_OR_APPEND picking up leftovers from a prior 
attempt");
+
+      // 5b. Lucene's view of the destination must match the mutable input's 
doc count, not the
+      //     stale leftover's.
+      try (Directory destDir = FSDirectory.open(destIndexDir.toPath());
+          DirectoryReader reader = DirectoryReader.open(destDir)) {
+        Assert.assertEquals(reader.numDocs(), numMutableDocs,
+            "Converted Lucene index must contain exactly the mutable input's 
document count, not "
+                + "stale leftovers from a prior attempt");
+      }
+
+      // 5c. DocId mapping file (shared multi-column form) must be sized for 
the segment's numDocs.
+      File mappingFile = new File(segmentIndexDir, 
MultiColumnTextIndexConstants.DOCID_MAPPING_FILE_NAME);
+      Assert.assertTrue(mappingFile.exists(),
+          "Shared multi-column docId mapping file should be built during 
conversion");
+      Assert.assertEquals(mappingFile.length(), (long) Integer.BYTES * 
numMutableDocs,
+          "DocId mapping file must be sized for the segment's numDocs only");
+    } finally {
+      FileUtils.deleteDirectory(scratch);
+    }
+  }
+
+  private static void writeLuceneDocs(File indexDir, String column, int 
startDocId, int count,
+      boolean multiSegment)
+      throws IOException {
+    try (Directory directory = FSDirectory.open(indexDir.toPath());
+        IndexWriter writer = new IndexWriter(directory,
+            new IndexWriterConfig(new StandardAnalyzer())
+                .setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
+      int splitAt = multiSegment ? Math.max(1, count / 2) : count;
+      for (int i = 0; i < splitAt; i++) {
+        writer.addDocument(buildDoc(column, startDocId + i));
+      }
+      if (multiSegment) {
+        writer.commit();
+        for (int i = splitAt; i < count; i++) {
+          writer.addDocument(buildDoc(column, startDocId + i));
+        }
+        writer.commit();
+        writer.forceMerge(1, true);
+      }
+      writer.commit();
+    }
+  }
+
+  private static Document buildDoc(String column, int docId) {
+    Document doc = new Document();
+    doc.add(new TextField(column, "doc " + docId, Field.Store.NO));
+    doc.add(new 
StoredField(MultiColumnLuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME, 
docId));
+    return doc;
+  }
+}
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
index 83b125b2951..b4444f3d959 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/LuceneTextIndexCreatorTest.java
@@ -20,11 +20,23 @@ package 
org.apache.pinot.segment.local.segment.index.creator;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.util.HashMap;
 import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
 import 
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
 import 
org.apache.pinot.segment.local.segment.index.readers.text.LuceneTextIndexReader;
 import 
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
+import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.index.TextIndexConfig;
 import org.testng.Assert;
 import org.testng.annotations.AfterMethod;
@@ -104,4 +116,117 @@ public class LuceneTextIndexCreatorTest {
       Assert.assertEquals(matchedDocIds[1], 2);
     }
   }
+
+  /**
+   * Regression test for the AIOOB triggered when a prior segment-conversion 
attempt left stale
+   * Lucene files at the v1 destination directory and the reuse-mutable-index 
path is taken.
+   */
+  @Test
+  public void testConvertMutableSegmentCleansStaleDestFiles()
+      throws Exception {
+    String column = "foo";
+    String segmentName = "regSeg";
+    int numMutableDocs = 3;
+    int numStaleDocs = 10;
+
+    File scratch = Files.createTempDirectory(
+        "LuceneTextIndexCreatorTest_stale_dest_").toFile();
+    try {
+      // 1. Build the mutable Lucene index
+      File consumerDir = new File(scratch, "consumerDir");
+      File mutableIndexDir = new File(new File(consumerDir, segmentName),
+          column + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+      FileUtils.forceMkdir(mutableIndexDir);
+      writeLuceneDocs(mutableIndexDir, /* startDocId */ 0, numMutableDocs, /* 
multiSegment */ false);
+
+      // 2. segmentIndexDir's parent
+      File tmpSegmentParent = new File(scratch, "tmp-" + segmentName + "-1");
+      File segmentIndexDir = new File(tmpSegmentParent, segmentName);
+      FileUtils.forceMkdir(segmentIndexDir);
+
+      // 3. Pre-populate the v1 destination with stale Lucene files that 
simulate a prior crashed
+      //    or killed conversion
+      File destIndexDir = new File(segmentIndexDir,
+          column + V1Constants.Indexes.LUCENE_V912_TEXT_INDEX_FILE_EXTENSION);
+      FileUtils.forceMkdir(destIndexDir);
+      writeLuceneDocs(destIndexDir, /* startDocId */ 100, numStaleDocs, /* 
multiSegment */ true);
+
+      File staleSentinel = new File(destIndexDir, 
"stale_from_prior_attempt.bin");
+      FileUtils.writeByteArrayToFile(staleSentinel, new byte[]{0x01, 0x02, 
0x03});
+
+      // 4. commit=true && realtimeConversion=true && reuseMutableIndex=true 
triggers
+      //    convertMutableSegment.
+      TextIndexConfig config = new 
TextIndexConfigBuilder().withReuseMutableIndex(true).build();
+      try (LuceneTextIndexCreator creator = new LuceneTextIndexCreator(column, 
segmentIndexDir,
+          /* commit */ true, /* realtimeConversion */ true, consumerDir,
+          /* mutableSegmentCompacted */ false, /* immutableToMutableIdMap */ 
null,
+          /* combineAndCleanupFiles */ false, config)) {
+        // Constructor performs the conversion; no add()/seal() on the reuse 
path.
+      }
+
+      // 5a. Dest wipe: sentinel must be gone.
+      Assert.assertFalse(staleSentinel.exists(),
+          "Stale file at v1 destination must be removed before conversion to "
+              + "avoid CREATE_OR_APPEND picking up leftovers from a prior 
attempt");
+
+      // 5b. Lucene's view of the destination must match the mutable input's 
doc count, not the
+      //     stale leftover's. Pre-fix, Lucene opens stale's higher-N segments 
file and reports
+      //     numStaleDocs instead.
+      try (Directory destDir = FSDirectory.open(destIndexDir.toPath());
+          DirectoryReader reader = DirectoryReader.open(destDir)) {
+        Assert.assertEquals(reader.numDocs(), numMutableDocs,
+            "Converted Lucene index must contain exactly the mutable input's 
document count, not "
+                + "stale leftovers from a prior attempt");
+      }
+
+      // 5c. DocId mapping file is sized by Lucene's numDocs at write time and 
remapped by the
+      //     segment's numDocs at read time - a mismatch here is what produced 
AIOOB in production.
+      File mappingFile = new File(segmentIndexDir,
+          column + 
V1Constants.Indexes.LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION);
+      Assert.assertTrue(mappingFile.exists(), "DocId mapping file should be 
built during conversion");
+      Assert.assertEquals(mappingFile.length(), (long) Integer.BYTES * 
numMutableDocs,
+          "DocId mapping file must be sized for the segment's numDocs only");
+
+      // 5d. End-to-end read
+      try (LuceneTextIndexReader reader = new LuceneTextIndexReader(column, 
segmentIndexDir,
+          numMutableDocs, config)) {
+        int[] matched = reader.getDocIds("doc").toArray();
+        for (int docId : matched) {
+          Assert.assertTrue(docId >= 0 && docId < numMutableDocs,
+              "Returned Pinot docId " + docId + " must be within [0, " + 
numMutableDocs + ")");
+        }
+      }
+    } finally {
+      FileUtils.deleteDirectory(scratch);
+    }
+  }
+
+  private static void writeLuceneDocs(File indexDir, int startDocId, int 
count, boolean multiSegment)
+      throws IOException {
+    try (Directory directory = FSDirectory.open(indexDir.toPath());
+        IndexWriter writer = new IndexWriter(directory,
+            new IndexWriterConfig(new StandardAnalyzer())
+                .setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
+      int splitAt = multiSegment ? Math.max(1, count / 2) : count;
+      for (int i = 0; i < splitAt; i++) {
+        writer.addDocument(buildDoc(startDocId + i));
+      }
+      if (multiSegment) {
+        writer.commit();
+        for (int i = splitAt; i < count; i++) {
+          writer.addDocument(buildDoc(startDocId + i));
+        }
+        writer.commit();
+        writer.forceMerge(1, true);
+      }
+      writer.commit();
+    }
+  }
+
+  private static Document buildDoc(int docId) {
+    Document doc = new Document();
+    doc.add(new TextField("foo", "doc " + docId, Field.Store.NO));
+    doc.add(new 
StoredField(LuceneTextIndexCreator.LUCENE_INDEX_DOC_ID_COLUMN_NAME, docId));
+    return doc;
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to