This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-vector-index.git
The following commit(s) were added to refs/heads/main by this push:
new f72c9b0 Guard IVFPQ index merge compatibility (#17)
f72c9b0 is described below
commit f72c9b02ea4aa4533c46a153a1309a0666b01545
Author: QuakeWang <[email protected]>
AuthorDate: Tue Jun 9 12:01:21 2026 +0800
Guard IVFPQ index merge compatibility (#17)
---
core/src/ivfpq.rs | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 154 insertions(+), 8 deletions(-)
diff --git a/core/src/ivfpq.rs b/core/src/ivfpq.rs
index 61c5363..1684481 100644
--- a/core/src/ivfpq.rs
+++ b/core/src/ivfpq.rs
@@ -653,13 +653,11 @@ impl IVFPQIndex {
}
/// Merge another index's inverted lists into this one.
- /// Both indexes must have the same centroids and codebooks (trained from the same data).
+ /// Both indexes must have identical training state: metric, residual mode,
+ /// OPQ rotation, coarse centroids, and PQ codebooks.
/// Used for compaction: merging multiple small index files into one.
- pub fn merge_from(&mut self, other: &IVFPQIndex) {
- assert_eq!(self.d, other.d, "Dimension mismatch");
- assert_eq!(self.nlist, other.nlist, "nlist mismatch");
- assert_eq!(self.pq.m, other.pq.m, "PQ M mismatch");
- assert_eq!(self.pq.nbits, other.pq.nbits, "PQ nbits mismatch");
+ pub fn merge_from(&mut self, other: &IVFPQIndex) -> io::Result<()> {
+ self.ensure_merge_compatible(other)?;
for list_id in 0..self.nlist {
self.ids[list_id].extend_from_slice(&other.ids[list_id]);
@@ -669,7 +667,81 @@ impl IVFPQIndex {
// Invalidate precomputed structures (need to rebuild after merge)
self.fastscan_codes.clear();
self.precomputed_table.clear();
+ Ok(())
}
+
+ fn ensure_merge_compatible(&self, other: &IVFPQIndex) -> io::Result<()> {
+ if self.d != other.d {
+ return Err(invalid_merge_input(format!(
+ "dimension mismatch: self={}, other={}",
+ self.d, other.d
+ )));
+ }
+ if self.nlist != other.nlist {
+ return Err(invalid_merge_input(format!(
+ "nlist mismatch: self={}, other={}",
+ self.nlist, other.nlist
+ )));
+ }
+ if self.metric != other.metric {
+ return Err(invalid_merge_input(format!(
+ "metric mismatch: self={:?}, other={:?}",
+ self.metric, other.metric
+ )));
+ }
+ if self.by_residual != other.by_residual {
+ return Err(invalid_merge_input(format!(
+ "residual mode mismatch: self={}, other={}",
+ self.by_residual, other.by_residual
+ )));
+ }
+ if self.pq.d != other.pq.d
+ || self.pq.m != other.pq.m
+ || self.pq.nbits != other.pq.nbits
+ || self.pq.dsub != other.pq.dsub
+ || self.pq.ksub != other.pq.ksub
+ {
+ return Err(invalid_merge_input(format!(
+ "PQ layout mismatch: self=(d={}, m={}, nbits={}, dsub={}, ksub={}), other=(d={}, m={}, nbits={}, dsub={}, ksub={})",
+ self.pq.d,
+ self.pq.m,
+ self.pq.nbits,
+ self.pq.dsub,
+ self.pq.ksub,
+ other.pq.d,
+ other.pq.m,
+ other.pq.nbits,
+ other.pq.dsub,
+ other.pq.ksub
+ )));
+ }
+ if self.opq.is_some() != other.opq.is_some() {
+ return Err(invalid_merge_input("OPQ configuration mismatch"));
+ }
+ if let (Some(self_opq), Some(other_opq)) = (&self.opq, &other.opq) {
+ if self_opq.d != other_opq.d || self_opq.m != other_opq.m {
+ return Err(invalid_merge_input(format!(
+ "OPQ layout mismatch: self=(d={}, m={}), other=(d={}, m={})",
+ self_opq.d, self_opq.m, other_opq.d, other_opq.m
+ )));
+ }
+ if self_opq.rotation != other_opq.rotation {
+ return Err(invalid_merge_input("OPQ rotation mismatch"));
+ }
+ }
+ if self.quantizer_centroids != other.quantizer_centroids {
+ return Err(invalid_merge_input("coarse centroids mismatch"));
+ }
+ if self.pq.centroids != other.pq.centroids {
+ return Err(invalid_merge_input("PQ codebooks mismatch"));
+ }
+
+ Ok(())
+ }
+}
+
+fn invalid_merge_input(message: impl Into<String>) -> io::Error {
+ io::Error::new(io::ErrorKind::InvalidInput, message.into())
}
/// Scan 4-bit packed codes using u8-domain accumulation.
@@ -1483,6 +1555,24 @@ mod tests {
data
}
+ fn assert_invalid_merge(base: &IVFPQIndex, other: &IVFPQIndex, expected_message: &str) {
+ let mut target = IVFPQIndex::from_trained(base);
+ let before_ids = target.ids.clone();
+ let before_codes = target.codes.clone();
+
+ let err = target.merge_from(other).unwrap_err();
+
+ assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
+ assert!(
+ err.to_string().contains(expected_message),
+ "merge error `{}` does not contain `{}`",
+ err,
+ expected_message
+ );
+ assert_eq!(target.ids, before_ids);
+ assert_eq!(target.codes, before_codes);
+ }
+
#[test]
fn test_build_and_search_l2() {
let d = 16;
@@ -1686,8 +1776,8 @@ mod tests {
assert_eq!(total_a + total_b, n * 2);
let mut merged = IVFPQIndex::from_trained(&trainer);
- merged.merge_from(&worker_a);
- merged.merge_from(&worker_b);
+ merged.merge_from(&worker_a).unwrap();
+ merged.merge_from(&worker_b).unwrap();
let total_merged: usize = merged.ids.iter().map(|l| l.len()).sum();
assert_eq!(total_merged, n * 2);
@@ -1701,6 +1791,62 @@ mod tests {
assert_eq!(labels[0], n as i64);
}
+ #[test]
+ fn test_merge_rejects_incompatible_training_state() {
+ let d = 16;
+ let nlist = 4;
+ let m = 4;
+ let n = 500;
+
+ let data = generate_clustered_data(n, d, 4, 42);
+ let ids: Vec<i64> = (0..n as i64).collect();
+
+ let mut trainer = IVFPQIndex::new(d, nlist, m, MetricType::L2, false);
+ trainer.train(&data, n);
+
+ let mut base = IVFPQIndex::from_trained(&trainer);
+ base.add(&data, &ids, n);
+
+ let mut mismatched_metric = IVFPQIndex::from_trained(&trainer);
+ mismatched_metric.metric = MetricType::InnerProduct;
+ mismatched_metric.by_residual = false;
+ assert_invalid_merge(&base, &mismatched_metric, "metric mismatch");
+
+ let mut mismatched_residual = IVFPQIndex::from_trained(&trainer);
+ mismatched_residual.by_residual = false;
+ assert_invalid_merge(&base, &mismatched_residual, "residual mode mismatch");
+
+ let mut mismatched_centroids = IVFPQIndex::from_trained(&trainer);
+ mismatched_centroids.quantizer_centroids[0] += 1.0;
+ assert_invalid_merge(&base, &mismatched_centroids, "coarse centroids mismatch");
+
+ let mut mismatched_codebooks = IVFPQIndex::from_trained(&trainer);
+ mismatched_codebooks.pq.centroids[0] += 1.0;
+ assert_invalid_merge(&base, &mismatched_codebooks, "PQ codebooks mismatch");
+
+ let mismatched_opq = IVFPQIndex::new(d, nlist, m, MetricType::L2, true);
+ assert_invalid_merge(&base, &mismatched_opq, "OPQ configuration mismatch");
+ }
+
+ #[test]
+ fn test_merge_rejects_incompatible_opq_rotation() {
+ let d = 16;
+ let nlist = 4;
+ let m = 4;
+ let n = 500;
+
+ let data = generate_clustered_data(n, d, 4, 55);
+
+ let mut trainer = IVFPQIndex::new(d, nlist, m, MetricType::L2, true);
+ trainer.train(&data, n);
+
+ let base = IVFPQIndex::from_trained(&trainer);
+ let mut mismatched_rotation = IVFPQIndex::from_trained(&trainer);
+ mismatched_rotation.opq.as_mut().unwrap().rotation[0] += 1.0;
+
+ assert_invalid_merge(&base, &mismatched_rotation, "OPQ rotation mismatch");
+ }
+
#[test]
fn test_opq_ip() {
let d = 16;