This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-vector-index.git
The following commit(s) were added to refs/heads/main by this push:
new dbf1d22 Reject non-zero v1 reserved bytes (#36)
dbf1d22 is described below
commit dbf1d222f73a134270ed41636a79cc8232703e66
Author: Jingsong Lee <[email protected]>
AuthorDate: Thu Jun 11 13:07:46 2026 +0800
Reject non-zero v1 reserved bytes (#36)
---
STORAGE_FORMAT.md | 10 +++++++---
core/src/index_io_util.rs | 10 ++++++++++
core/src/io.rs | 33 ++++++++++++++++++++++++++++++---
core/src/ivfflat_io.rs | 21 +++++++++++++++++++++
core/src/ivfhnswflat_io.rs | 27 +++++++++++++++++++++++++--
core/src/ivfhnswsq_io.rs | 29 +++++++++++++++++++++++++++--
6 files changed, 120 insertions(+), 10 deletions(-)
diff --git a/STORAGE_FORMAT.md b/STORAGE_FORMAT.md
index 2e1bf5f..d66699c 100644
--- a/STORAGE_FORMAT.md
+++ b/STORAGE_FORMAT.md
@@ -31,14 +31,18 @@ of the compatibility contract.
Because the fields are little-endian, the raw file bytes for those constants
appear in reverse ASCII order.
- Readers reject unknown magic values, unknown versions, unknown required
flags,
- invalid section sizes, negative counts, and malformed list payload metadata.
+ non-zero reserved bytes, invalid section sizes, negative counts, and
malformed
+ list payload metadata.
- Incompatible on-disk changes require a new format version. Version 1 readers
do not attempt to read future versions.
-- Reserved bytes are written as zero. Readers currently skip reserved bytes
- unless a format explicitly assigns them meaning in a later version.
+- Reserved bytes are written as zero and must be read back as zero. Future
+ extensions must use flags or a new format version rather than repurposing
+ non-zero reserved bytes within v1.
- Index files have no outer container, footer, checksum, compression envelope,
or schema registry. The complete file starts at byte offset 0 with one of the
headers below.
+- File integrity, including length and checksum validation, is guaranteed by
+ the outer Paimon file/manifest layer rather than by an embedded index footer.
- Roaring row-id filters are a query-time API payload. They are not embedded in
any index file format.
diff --git a/core/src/index_io_util.rs b/core/src/index_io_util.rs
index d788764..e5f2796 100644
--- a/core/src/index_io_util.rs
+++ b/core/src/index_io_util.rs
@@ -65,6 +65,16 @@ pub(crate) fn validate_search_inputs(
Ok(())
}
+pub(crate) fn validate_reserved_zero(bytes: &[u8], format_name: &str) ->
io::Result<()> {
+ if bytes.iter().any(|&byte| byte != 0) {
+ return Err(io::Error::new(
+ io::ErrorKind::InvalidData,
+ format!("{} reserved bytes must be zero", format_name),
+ ));
+ }
+ Ok(())
+}
+
const HNSW_GRAPH_MAGIC: u32 = 0x48574752; // "HWGR"
const HNSW_GRAPH_VERSION: u32 = 1;
const HNSW_GRAPH_FLAG_DELTA_VARINT: u32 = 1 << 0;
diff --git a/core/src/io.rs b/core/src/io.rs
index 67c7e24..7570c82 100644
--- a/core/src/io.rs
+++ b/core/src/io.rs
@@ -16,7 +16,9 @@
// under the License.
use crate::distance::MetricType;
-use crate::index_io_util::{decode_delta_varint_ids, encode_delta_varint_ids};
+use crate::index_io_util::{
+ decode_delta_varint_ids, encode_delta_varint_ids, validate_reserved_zero,
+};
use crate::ivfpq::IVFPQIndex;
use crate::opq::OPQMatrix;
use crate::pq::ProductQuantizer;
@@ -510,8 +512,9 @@ impl<R: SeekRead> IVFPQIndexReader<R> {
let total_vectors = read_i64_le(&mut cursor)?;
let flags = read_u32_le(&mut cursor)?;
- let mut skip = [0u8; 20];
- cursor.read_exact(&mut skip)?;
+ let mut reserved = [0u8; 20];
+ cursor.read_exact(&mut reserved)?;
+ validate_reserved_zero(&reserved, "IVFPQ")?;
let unknown_flags = flags & !SUPPORTED_FLAGS;
if unknown_flags != 0 {
return Err(io::Error::new(
@@ -1455,6 +1458,30 @@ mod tests {
assert!(err.to_string().contains("Unsupported IVFPQ flags"));
}
+ #[test]
+ fn test_nonzero_reserved_bytes_returns_error() {
+ let mut buf = Vec::new();
+ buf.extend_from_slice(&MAGIC.to_le_bytes());
+ buf.extend_from_slice(&VERSION.to_le_bytes());
+ buf.extend_from_slice(&4i32.to_le_bytes());
+ buf.extend_from_slice(&1i32.to_le_bytes());
+ buf.extend_from_slice(&1i32.to_le_bytes());
+ buf.extend_from_slice(&256i32.to_le_bytes());
+ buf.extend_from_slice(&4i32.to_le_bytes());
+ buf.extend_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+ buf.extend_from_slice(&0i64.to_le_bytes());
+ buf.extend_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+ buf.extend_from_slice(&[0u8; 20]);
+ buf[44] = 1;
+
+ let mut cursor = Cursor::new(&buf);
+ let err = match IVFPQIndexReader::open(&mut cursor) {
+ Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+ Err(err) => err,
+ };
+ assert!(err.to_string().contains("reserved bytes must be zero"));
+ }
+
#[test]
fn test_d_not_equal_m_times_dsub_returns_error() {
let mut buf = Vec::new();
diff --git a/core/src/ivfflat_io.rs b/core/src/ivfflat_io.rs
index 901329c..56d860f 100644
--- a/core/src/ivfflat_io.rs
+++ b/core/src/ivfflat_io.rs
@@ -16,6 +16,7 @@
// under the License.
use crate::distance::{fvec_distance, fvec_normalize, MetricType};
+use crate::index_io_util::validate_reserved_zero;
use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
use crate::ivfflat::IVFFlatIndex;
use crate::ivfpq::RowIdFilter;
@@ -193,6 +194,7 @@ impl<R: SeekRead> IVFFlatIndexReader<R> {
let flags = read_u32_le(&mut cursor)?;
let mut reserved = [0u8; 32];
cursor.read_exact(&mut reserved)?;
+ validate_reserved_zero(&reserved, "IVFFLAT")?;
let unknown_flags = flags & !SUPPORTED_FLAGS;
if unknown_flags != 0 {
return Err(io::Error::new(
@@ -1099,4 +1101,23 @@ mod tests {
};
assert!(err.to_string().contains("Unsupported IVFFLAT flags"));
}
+
+ #[test]
+ fn test_ivfflat_reader_rejects_nonzero_reserved_bytes() {
+ let mut buf = vec![0u8; IVFFLAT_HEADER_SIZE];
+ buf[0..4].copy_from_slice(&IVFFLAT_MAGIC.to_le_bytes());
+ buf[4..8].copy_from_slice(&IVFFLAT_VERSION.to_le_bytes());
+ buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+ buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+ buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+ buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+ buf[28..32].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+ buf[32] = 1;
+
+ let err = match IVFFlatIndexReader::open(Cursor::new(buf)) {
+ Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+ Err(err) => err,
+ };
+ assert!(err.to_string().contains("reserved bytes must be zero"));
+ }
}
diff --git a/core/src/ivfhnswflat_io.rs b/core/src/ivfhnswflat_io.rs
index 0e949b2..f87e413 100644
--- a/core/src/ivfhnswflat_io.rs
+++ b/core/src/ivfhnswflat_io.rs
@@ -22,8 +22,8 @@ use crate::index_io_util::{
bytes_to_f32_vec, checked_list_bytes, checked_list_offset,
checked_section_size,
decode_delta_varint_ids, decode_graph, decode_roaring_filter,
encode_delta_varint_ids,
encode_graph, read_f32_vec, read_i32_le, read_i64_le, read_u32_le,
u64_to_i64, usize_to_i32,
- usize_to_i64, validate_positive_i32, validate_search_inputs,
write_f32_slice, write_i32_le,
- write_i64_le, write_u32_le,
+ usize_to_i64, validate_positive_i32, validate_reserved_zero,
validate_search_inputs,
+ write_f32_slice, write_i32_le, write_i64_le, write_u32_le,
};
use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
use crate::ivfhnswflat::IVFHNSWFlatIndex;
@@ -200,6 +200,7 @@ impl<R: SeekRead> IVFHNSWFlatIndexReader<R> {
let flags = read_u32_le(&mut cursor)?;
let mut reserved = [0u8; 20];
cursor.read_exact(&mut reserved)?;
+ validate_reserved_zero(&reserved, "IVF_HNSW_FLAT")?;
let unknown_flags = flags & !SUPPORTED_FLAGS;
if unknown_flags != 0 {
return Err(io::Error::new(
@@ -1128,6 +1129,28 @@ mod tests {
assert!(err.to_string().contains("Unsupported IVF_HNSW_FLAT flags"));
}
+ #[test]
+ fn test_ivfhnswflat_reader_rejects_nonzero_reserved_bytes() {
+ let mut buf = vec![0u8; IVF_HNSW_FLAT_HEADER_SIZE];
+ buf[0..4].copy_from_slice(&IVF_HNSW_FLAT_MAGIC.to_le_bytes());
+ buf[4..8].copy_from_slice(&IVF_HNSW_FLAT_VERSION.to_le_bytes());
+ buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+ buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+ buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+ buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+ buf[28..32].copy_from_slice(&2i32.to_le_bytes());
+ buf[32..36].copy_from_slice(&8i32.to_le_bytes());
+ buf[36..40].copy_from_slice(&3i32.to_le_bytes());
+ buf[40..44].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+ buf[44] = 1;
+
+ let err = match IVFHNSWFlatIndexReader::open(Cursor::new(buf)) {
+ Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+ Err(err) => err,
+ };
+ assert!(err.to_string().contains("reserved bytes must be zero"));
+ }
+
#[test]
fn test_ivfhnswflat_write_read_search_roundtrip() {
let d = 4;
diff --git a/core/src/ivfhnswsq_io.rs b/core/src/ivfhnswsq_io.rs
index c6c57e2..d5ea2dc 100644
--- a/core/src/ivfhnswsq_io.rs
+++ b/core/src/ivfhnswsq_io.rs
@@ -22,8 +22,8 @@ use crate::index_io_util::{
checked_list_bytes, checked_list_offset, checked_section_size,
decode_delta_varint_ids,
decode_graph, decode_roaring_filter, encode_delta_varint_ids,
encode_graph, read_f32_vec,
read_i32_le, read_i64_le, read_u32_le, u64_to_i64, usize_to_i32,
usize_to_i64,
- validate_positive_i32, validate_search_inputs, write_f32_slice,
write_i32_le, write_i64_le,
- write_u32_le,
+ validate_positive_i32, validate_reserved_zero, validate_search_inputs,
write_f32_slice,
+ write_i32_le, write_i64_le, write_u32_le,
};
use crate::io::{PreadCursor, ReadRequest, SeekRead, SeekWrite};
use crate::ivfhnswsq::IVFHNSWSQIndex;
@@ -209,6 +209,7 @@ impl<R: SeekRead> IVFHNSWSQIndexReader<R> {
let flags = read_u32_le(&mut cursor)?;
let mut reserved = [0u8; 12];
cursor.read_exact(&mut reserved)?;
+ validate_reserved_zero(&reserved, "IVF_HNSW_SQ")?;
let unknown_flags = flags & !SUPPORTED_FLAGS;
if unknown_flags != 0 {
return Err(io::Error::new(
@@ -1243,6 +1244,30 @@ mod tests {
assert!(err.to_string().contains("Unsupported IVF_HNSW_SQ flags"));
}
+ #[test]
+ fn test_ivfhnswsq_reader_rejects_nonzero_reserved_bytes() {
+ let mut buf = vec![0u8; IVF_HNSW_SQ_HEADER_SIZE + 16];
+ buf[0..4].copy_from_slice(&IVF_HNSW_SQ_MAGIC.to_le_bytes());
+ buf[4..8].copy_from_slice(&IVF_HNSW_SQ_VERSION.to_le_bytes());
+ buf[8..12].copy_from_slice(&2i32.to_le_bytes());
+ buf[12..16].copy_from_slice(&1i32.to_le_bytes());
+ buf[16..20].copy_from_slice(&(MetricType::L2 as u32).to_le_bytes());
+ buf[20..28].copy_from_slice(&0i64.to_le_bytes());
+ buf[28..32].copy_from_slice(&2i32.to_le_bytes());
+ buf[32..36].copy_from_slice(&8i32.to_le_bytes());
+ buf[36..40].copy_from_slice(&3i32.to_le_bytes());
+ buf[40..44].copy_from_slice(&0.0f32.to_le_bytes());
+ buf[44..48].copy_from_slice(&0.0f32.to_le_bytes());
+ buf[48..52].copy_from_slice(&REQUIRED_FLAGS.to_le_bytes());
+ buf[52] = 1;
+
+ let err = match IVFHNSWSQIndexReader::open(Cursor::new(buf)) {
+ Ok(_) => panic!("non-zero reserved bytes should be rejected"),
+ Err(err) => err,
+ };
+ assert!(err.to_string().contains("reserved bytes must be zero"));
+ }
+
#[test]
fn test_ivfhnswsq_write_read_search_roundtrip() {
let d = 4;