This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-vector-index.git


The following commit(s) were added to refs/heads/main by this push:
     new dbf1d22  Reject non-zero v1 reserved bytes (#36)
dbf1d22 is described below

commit dbf1d222f73a134270ed41636a79cc8232703e66
Author: Jingsong Lee <[email protected]>
AuthorDate: Thu Jun 11 13:07:46 2026 +0800

    Reject non-zero v1 reserved bytes (#36)
---
 STORAGE_FORMAT.md          | 10 +++++++---
 core/src/index_io_util.rs  | 10 ++++++++++
 core/src/io.rs             | 33 ++++++++++++++++++++++++++++++---
 core/src/ivfflat_io.rs     | 21 +++++++++++++++++++++
 core/src/ivfhnswflat_io.rs | 27 +++++++++++++++++++++++++--
 core/src/ivfhnswsq_io.rs   | 29 +++++++++++++++++++++++++++--
 6 files changed, 120 insertions(+), 10 deletions(-)

diff --git a/STORAGE_FORMAT.md b/STORAGE_FORMAT.md
index 2e1bf5f..d66699c 100644
--- a/STORAGE_FORMAT.md
+++ b/STORAGE_FORMAT.md
@@ -31,14 +31,18 @@ of the compatibility contract.
   Because the fields are little-endian, the raw file bytes for those constants
   appear in reverse ASCII order.
 - Readers reject unknown magic values, unknown versions, unknown required flags,
-  invalid section sizes, negative counts, and malformed list payload metadata.
+  non-zero reserved bytes, invalid section sizes, negative counts, and malformed
+  list payload metadata.
 - Incompatible on-disk changes require a new format version. Version 1 readers
   do not attempt to read future versions.
-- Reserved bytes are written as zero. Readers currently skip reserved bytes
-  unless a format explicitly assigns them meaning in a later version.
+- Reserved bytes are written as zero and must be read back as zero. Future
+  extensions must use flags or a new format version rather than repurposing
+  non-zero reserved bytes within v1.
 - Index files have no outer container, footer, checksum, compression envelope,
   or schema registry. The complete file starts at byte offset 0 with one of the
   headers below.
+- File integrity, including length and checksum validation, is guaranteed by
+  the outer Paimon file/manifest layer rather than by an embedded index footer.
 - Roaring row-id filters are a query-time API payload. They are not embedded in
   any index file format.
 
diff --git a/core/src/index_io_util.rs b/core/src/index_io_util.rs
index d788764..e5f2796 100644
--- a/core/src/index_io_util.rs
+++ b/core/src/index_io_util.rs
@@ -65,6 +65,16 @@ pub(crate) fn validate_search_inputs(
     Ok(())
 }
 
+pub(crate) fn validate_reserved_zero(bytes: &[u8], format_name: &str) -> io::Result<()> {
+    if bytes.iter().any(|&byte| byte != 0) {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("{} reserved bytes must be zero", format_name),
+        ));
+    }
+    Ok(())
+}
+
 const HNSW_GRAPH_MAGIC: u32 = 0x48574752; // "HWGR"
 const HNSW_GRAPH_VERSION: u32 = 1;
 const HNSW_GRAPH_FLAG_DELTA_VARINT: u32 = 1 << 0;
diff --git a/core/src/io.rs b/core/src/io.rs
index 67c7e24..7570c82 100644
--- a/core/src/io.rs
+++ b/core/src/io.rs
@@ -16,7 +16,9 @@
 // under the License.
 
 use crate::distance::MetricType;
-use crate::index_io_util::{decode_delta_varint_ids, encode_delta_varint_ids};
+use crate::index_io_util::{
+    decode_delta_varint_ids, encode_delta_varint_ids, validate_reserved_zero,
+};
 use crate::ivfpq::IVFPQIndex;
 use crate::opq::OPQMatrix;
 use crate::pq::ProductQuantizer;
@@ -510,8 +512,9 @@ impl<R: SeekRead> IVFPQIndexReader<R> {
         let total_vectors = read_i64_le(&mut cursor)?;
 
         let flags = read_u32_le(&mut cursor)?;
-        let mut skip = [0u8; 20];
-        cursor.read_exact(&mut skip)?;
+        let mut reserved = [0u8; 20];
+        cursor.read_exact(&mut reserved)?;
+        validate_reserved_zero(&reserved, "IVFPQ")?;
         let unknown_flags = flags & !SUPPORTED_FLAGS;
         if unknown_flags != 0 {
             return Err(io::Error::new(
@@ -1455,6 +1458,30 @@ mod tests {
         assert!(err.to_string().contains("Unsupported IVFPQ flags"));
     }
 
+    #[test]
+    fn test_nonzero_reserved_bytes_returns_error() {
+        let mut buf = Vec::new();
+        buf.extend_from_slice(&MAGIC.to_le_bytes());
+        buf.extend_from_slice(&VERSION.to_le_bytes());
+        buf.extend_from_slice(&4i32.to_le_bytes());
+        buf.extend_from_slice(&1i32.to_le_bytes());
+        buf.extend_from_slice(&1i32.to_le_bytes());
+        buf.extend_from_slice(&256i32.to_le_bytes());
+        buf.extend_from_slice(&4i32.to_le_bytes());
+        buf.extend_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+        buf.extend_from_slice(&0i64.to_le_bytes());
+        buf.extend_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+        buf.extend_from_slice(&[0u8; 20]);
+        buf[44] = 1;
+
+        let mut cursor = Cursor::new(&buf);
+        let err = match IVFPQIndexReader::open(&mut cursor) {
+            Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+            Err(err) => err,
+        };
+        assert!(err.to_string().contains("reserved bytes must be zero"));
+    }
+
     #[test]
     fn test_d_not_equal_m_times_dsub_returns_error() {
         let mut buf = Vec::new();
diff --git a/core/src/ivfflat_io.rs b/core/src/ivfflat_io.rs
index 901329c..56d860f 100644
--- a/core/src/ivfflat_io.rs
+++ b/core/src/ivfflat_io.rs
@@ -16,6 +16,7 @@
 // under the License.
 
 use crate::distance::{fvec_distance, fvec_normalize, MetricType};
+use crate::index_io_util::validate_reserved_zero;
 use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
 use crate::ivfflat::IVFFlatIndex;
 use crate::ivfpq::RowIdFilter;
@@ -193,6 +194,7 @@ impl<R: SeekRead> IVFFlatIndexReader<R> {
         let flags = read_u32_le(&mut cursor)?;
         let mut reserved = [0u8; 32];
         cursor.read_exact(&mut reserved)?;
+        validate_reserved_zero(&reserved, "IVFFLAT")?;
         let unknown_flags = flags & !SUPPORTED_FLAGS;
         if unknown_flags != 0 {
             return Err(io::Error::new(
@@ -1099,4 +1101,23 @@ mod tests {
         };
         assert!(err.to_string().contains("Unsupported IVFFLAT flags"));
     }
+
+    #[test]
+    fn test_ivfflat_reader_rejects_nonzero_reserved_bytes() {
+        let mut buf = vec![0u8; IVFFLAT_HEADER_SIZE];
+        buf[0..4].copy_from_slice(&IVFFLAT_MAGIC.to_le_bytes());
+        buf[4..8].copy_from_slice(&IVFFLAT_VERSION.to_le_bytes());
+        buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+        buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+        buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+        buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+        buf[28..32].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+        buf[32] = 1;
+
+        let err = match IVFFlatIndexReader::open(Cursor::new(buf)) {
+            Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+            Err(err) => err,
+        };
+        assert!(err.to_string().contains("reserved bytes must be zero"));
+    }
 }
diff --git a/core/src/ivfhnswflat_io.rs b/core/src/ivfhnswflat_io.rs
index 0e949b2..f87e413 100644
--- a/core/src/ivfhnswflat_io.rs
+++ b/core/src/ivfhnswflat_io.rs
@@ -22,8 +22,8 @@ use crate::index_io_util::{
     bytes_to_f32_vec, checked_list_bytes, checked_list_offset, checked_section_size,
     decode_delta_varint_ids, decode_graph, decode_roaring_filter, encode_delta_varint_ids,
     encode_graph, read_f32_vec, read_i32_le, read_i64_le, read_u32_le, u64_to_i64, usize_to_i32,
-    usize_to_i64, validate_positive_i32, validate_search_inputs, write_f32_slice, write_i32_le,
-    write_i64_le, write_u32_le,
+    usize_to_i64, validate_positive_i32, validate_reserved_zero, validate_search_inputs,
+    write_f32_slice, write_i32_le, write_i64_le, write_u32_le,
 };
 use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
 use crate::ivfhnswflat::IVFHNSWFlatIndex;
@@ -200,6 +200,7 @@ impl<R: SeekRead> IVFHNSWFlatIndexReader<R> {
         let flags = read_u32_le(&mut cursor)?;
         let mut reserved = [0u8; 20];
         cursor.read_exact(&mut reserved)?;
+        validate_reserved_zero(&reserved, "IVF_HNSW_FLAT")?;
         let unknown_flags = flags & !SUPPORTED_FLAGS;
         if unknown_flags != 0 {
             return Err(io::Error::new(
@@ -1128,6 +1129,28 @@ mod tests {
         assert!(err.to_string().contains("Unsupported IVF_HNSW_FLAT flags"));
     }
 
+    #[test]
+    fn test_ivfhnswflat_reader_rejects_nonzero_reserved_bytes() {
+        let mut buf = vec![0u8; IVF_HNSW_FLAT_HEADER_SIZE];
+        buf[0..4].copy_from_slice(&IVF_HNSW_FLAT_MAGIC.to_le_bytes());
+        buf[4..8].copy_from_slice(&IVF_HNSW_FLAT_VERSION.to_le_bytes());
+        buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+        buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+        buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+        buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+        buf[28..32].copy_from_slice(&2i32.to_le_bytes());
+        buf[32..36].copy_from_slice(&8i32.to_le_bytes());
+        buf[36..40].copy_from_slice(&3i32.to_le_bytes());
+        buf[40..44].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+        buf[44] = 1;
+
+        let err = match IVFHNSWFlatIndexReader::open(Cursor::new(buf)) {
+            Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+            Err(err) => err,
+        };
+        assert!(err.to_string().contains("reserved bytes must be zero"));
+    }
+
     #[test]
     fn test_ivfhnswflat_write_read_search_roundtrip() {
         let d = 4;
diff --git a/core/src/ivfhnswsq_io.rs b/core/src/ivfhnswsq_io.rs
index c6c57e2..d5ea2dc 100644
--- a/core/src/ivfhnswsq_io.rs
+++ b/core/src/ivfhnswsq_io.rs
@@ -22,8 +22,8 @@ use crate::index_io_util::{
     checked_list_bytes, checked_list_offset, checked_section_size, decode_delta_varint_ids,
     decode_graph, decode_roaring_filter, encode_delta_varint_ids, encode_graph, read_f32_vec,
     read_i32_le, read_i64_le, read_u32_le, u64_to_i64, usize_to_i32, usize_to_i64,
-    validate_positive_i32, validate_search_inputs, write_f32_slice, write_i32_le, write_i64_le,
-    write_u32_le,
+    validate_positive_i32, validate_reserved_zero, validate_search_inputs, write_f32_slice,
+    write_i32_le, write_i64_le, write_u32_le,
 };
 use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
 use crate::ivfhnswsq::IVFHNSWSQIndex;
@@ -209,6 +209,7 @@ impl<R: SeekRead> IVFHNSWSQIndexReader<R> {
         let flags = read_u32_le(&mut cursor)?;
         let mut reserved = [0u8; 12];
         cursor.read_exact(&mut reserved)?;
+        validate_reserved_zero(&reserved, "IVF_HNSW_SQ")?;
         let unknown_flags = flags & !SUPPORTED_FLAGS;
         if unknown_flags != 0 {
             return Err(io::Error::new(
@@ -1243,6 +1244,30 @@ mod tests {
         assert!(err.to_string().contains("Unsupported IVF_HNSW_SQ flags"));
     }
 
+    #[test]
+    fn test_ivfhnswsq_reader_rejects_nonzero_reserved_bytes() {
+        let mut buf = vec![0u8; IVF_HNSW_SQ_HEADER_SIZE + 16];
+        buf[0..4].copy_from_slice(&IVF_HNSW_SQ_MAGIC.to_le_bytes());
+        buf[4..8].copy_from_slice(&IVF_HNSW_SQ_VERSION.to_le_bytes());
+        buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+        buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+        buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+        buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+        buf[28..32].copy_from_slice(&2i32.to_le_bytes());
+        buf[32..36].copy_from_slice(&8i32.to_le_bytes());
+        buf[36..40].copy_from_slice(&3i32.to_le_bytes());
+        buf[40..44].copy_from_slice(&0.0f32.to_le_bytes());
+        buf[44..48].copy_from_slice(&0.0f32.to_le_bytes());
+        buf[48..52].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+        buf[52] = 1;
+
+        let err = match IVFHNSWSQIndexReader::open(Cursor::new(buf)) {
+            Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+            Err(err) => err,
+        };
+        assert!(err.to_string().contains("reserved bytes must be zero"));
+    }
+
     #[test]
     fn test_ivfhnswsq_write_read_search_roundtrip() {
         let d = 4;

Reply via email to