This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-vector-index.git


The following commit(s) were added to refs/heads/main by this push:
     new 9299ff7  Add ANN benchmark for vector indexes (#25)
9299ff7 is described below

commit 9299ff7b65ba5b4b375af5474d9db05dfd72c6bf
Author: Jingsong Lee <[email protected]>
AuthorDate: Tue Jun 9 23:43:49 2026 +0800

    Add ANN benchmark for vector indexes (#25)
---
 README.md                 |  21 +++
 core/Cargo.toml           |   4 +
 core/benches/ann_bench.rs | 449 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 474 insertions(+)

diff --git a/README.md b/README.md
index bf9ad97..66e93d9 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,27 @@ Bindings expose the same wire format:
 
 Row IDs must be non-negative to map directly into `RoaringTreemap`'s `u64` 
domain.
 
+## ANN Benchmark
+
+The core crate includes an ANN-style benchmark for comparing Paimon's IVF-PQ,
+IVF-HNSW-FLAT, and IVF-HNSW-SQ indexes. It reports build time, reader open/load
+time, first-query latency, batch query throughput, and serialized index size:
+
+```bash
+cargo bench -p paimon-vindex-core --bench ann_bench -- --nocapture
+```
+
+The benchmark is configured with environment variables:
+
+```bash
+ANN_N=100000 ANN_NQ=1000 ANN_D=128 ANN_K=10 ANN_NLIST=256 ANN_NPROBE=16 \
+ANN_PQ_M=16 ANN_HNSW_EF_CONSTRUCTION=150 ANN_HNSW_EF_SEARCH=80 \
+cargo bench -p paimon-vindex-core --bench ann_bench -- --nocapture
+```
+
+Benchmark rows report `disk_scope=index_bytes`, which is the serialized vector
+index file.
+
 ## Unified API
 
 Rust, Java, and Python expose one writer and one reader API. Writers are 
created
diff --git a/core/Cargo.toml b/core/Cargo.toml
index af0d65e..f331ef7 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -38,3 +38,7 @@ harness = false
 [[bench]]
 name = "recall_bench"
 harness = false
+
+[[bench]]
+name = "ann_bench"
+harness = false
diff --git a/core/benches/ann_bench.rs b/core/benches/ann_bench.rs
new file mode 100644
index 0000000..f8f3f73
--- /dev/null
+++ b/core/benches/ann_bench.rs
@@ -0,0 +1,449 @@
+use paimon_vindex_core::distance::MetricType;
+use paimon_vindex_core::hnsw::HnswBuildParams;
+use paimon_vindex_core::io::{write_index, IVFPQIndexReader, PosWriter};
+use paimon_vindex_core::ivfhnswflat::IVFHNSWFlatIndex;
+use paimon_vindex_core::ivfhnswflat_io::{
+    search_batch_ivfhnswflat_reader, write_ivfhnswflat_index, 
IVFHNSWFlatIndexReader,
+};
+use paimon_vindex_core::ivfhnswsq::IVFHNSWSQIndex;
+use paimon_vindex_core::ivfhnswsq_io::{
+    search_batch_ivfhnswsq_reader, write_ivfhnswsq_index, IVFHNSWSQIndexReader,
+};
+use paimon_vindex_core::ivfpq::{search_batch_reader, IVFPQIndex};
+use std::env;
+use std::fs;
+use std::io;
+use std::path::{Path, PathBuf};
+use std::time::{Duration, Instant};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let cfg = Config::from_env()?;
+    let dataset = Dataset::clustered(cfg.n, cfg.nq, cfg.d, cfg.clusters, 
cfg.seed);
+    let ids: Vec<i64> = (0..cfg.n as i64).collect();
+    let workspace = prepare_workspace(&cfg.output_dir)?;
+
+    println!("{}", CsvRow::header());
+    run_paimon_ivfpq(&cfg, &dataset, &ids, &workspace)?;
+    run_paimon_ivfhnswflat(&cfg, &dataset, &ids, &workspace)?;
+    run_paimon_ivfhnswsq(&cfg, &dataset, &ids, &workspace)?;
+
+    Ok(())
+}
+
+#[derive(Clone)]
+struct Config {
+    n: usize,
+    nq: usize,
+    d: usize,
+    k: usize,
+    nlist: usize,
+    nprobe: usize,
+    pq_m: usize,
+    hnsw_m: usize,
+    hnsw_ef_construction: usize,
+    hnsw_ef_search: usize,
+    clusters: usize,
+    seed: u64,
+    output_dir: PathBuf,
+}
+
+impl Config {
+    fn from_env() -> Result<Self, Box<dyn std::error::Error>> {
+        let n = read_env("ANN_N", 10_000)?;
+        let nq = read_env("ANN_NQ", 100)?;
+        let d = read_env("ANN_D", 128)?;
+        let k = read_env("ANN_K", 10)?;
+        let nlist = read_env("ANN_NLIST", 64)?;
+        let nprobe = read_env("ANN_NPROBE", 8)?;
+        let pq_m = read_env("ANN_PQ_M", 16)?;
+        let hnsw_m = read_env("ANN_HNSW_M", 20)?;
+        let hnsw_ef_construction = read_env("ANN_HNSW_EF_CONSTRUCTION", 150)?;
+        let hnsw_ef_search = read_env("ANN_HNSW_EF_SEARCH", 80)?;
+        let clusters = read_env("ANN_CLUSTERS", 32)?;
+        let seed = read_env("ANN_SEED", 42)?;
+        let output_dir = env::var("ANN_OUTPUT_DIR")
+            .map(PathBuf::from)
+            .unwrap_or_else(|_| env::temp_dir().join("paimon-ann-bench"));
+
+        if n == 0 || nq == 0 || d == 0 || k == 0 || nlist == 0 || nprobe == 0 {
+            return Err(
+                "ANN_N, ANN_NQ, ANN_D, ANN_K, ANN_NLIST, and ANN_NPROBE must 
be > 0".into(),
+            );
+        }
+        if nlist > n {
+            return Err(format!("ANN_NLIST ({}) must be <= ANN_N ({})", nlist, 
n).into());
+        }
+        if nprobe > nlist {
+            return Err(format!("ANN_NPROBE ({}) must be <= ANN_NLIST ({})", 
nprobe, nlist).into());
+        }
+        if d % pq_m != 0 {
+            return Err(format!("ANN_D ({}) must be divisible by ANN_PQ_M 
({})", d, pq_m).into());
+        }
+
+        Ok(Self {
+            n,
+            nq,
+            d,
+            k,
+            nlist,
+            nprobe,
+            pq_m,
+            hnsw_m,
+            hnsw_ef_construction,
+            hnsw_ef_search,
+            clusters,
+            seed,
+            output_dir,
+        })
+    }
+
+    fn hnsw_params(&self) -> HnswBuildParams {
+        HnswBuildParams {
+            m: self.hnsw_m,
+            ef_construction: self.hnsw_ef_construction,
+            ..HnswBuildParams::default()
+        }
+        .sanitized()
+    }
+}
+
+struct Dataset {
+    data: Vec<f32>,
+    queries: Vec<f32>,
+}
+
+impl Dataset {
+    fn clustered(n: usize, nq: usize, d: usize, clusters: usize, seed: u64) -> 
Self {
+        let mut rng = Lcg::new(seed);
+        let mut centers = vec![0.0f32; clusters * d];
+        for value in &mut centers {
+            *value = rng.next_f32() * 30.0;
+        }
+
+        let mut data = vec![0.0f32; n * d];
+        for i in 0..n {
+            let cluster = i % clusters;
+            for j in 0..d {
+                data[i * d + j] = centers[cluster * d + j] + rng.next_f32();
+            }
+        }
+
+        let mut queries = vec![0.0f32; nq * d];
+        for qi in 0..nq {
+            let source = (qi * 9973) % n;
+            queries[qi * d..(qi + 1) * d].copy_from_slice(&data[source * 
d..(source + 1) * d]);
+        }
+
+        Self { data, queries }
+    }
+}
+
+struct Lcg {
+    state: u64,
+}
+
+impl Lcg {
+    fn new(seed: u64) -> Self {
+        Self { state: seed }
+    }
+
+    fn next_f32(&mut self) -> f32 {
+        self.state = 
self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
+        ((self.state >> 33) as f32) / (u32::MAX as f32) * 2.0 - 1.0
+    }
+}
+
+struct CsvRow {
+    engine: &'static str,
+    index: &'static str,
+    n: usize,
+    nq: usize,
+    d: usize,
+    k: usize,
+    nlist: usize,
+    nprobe: usize,
+    ef_search: Option<usize>,
+    build_ms: u128,
+    read_ms: u128,
+    first_query_ms: u128,
+    search_ms: u128,
+    qps: f64,
+    disk_bytes: u64,
+    disk_scope: &'static str,
+    note: &'static str,
+}
+
+impl CsvRow {
+    fn header() -> &'static str {
+        
"engine,index,n,nq,d,k,nlist,nprobe,ef_search,build_ms,read_ms,first_query_ms,search_ms,qps,disk_bytes,disk_scope,note"
+    }
+}
+
+impl std::fmt::Display for CsvRow {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{},{},{},{},{},{},{},{},{},{},{},{},{},{:.2},{},{},{}",
+            self.engine,
+            self.index,
+            self.n,
+            self.nq,
+            self.d,
+            self.k,
+            self.nlist,
+            self.nprobe,
+            self.ef_search
+                .map(|value| value.to_string())
+                .unwrap_or_default(),
+            self.build_ms,
+            self.read_ms,
+            self.first_query_ms,
+            self.search_ms,
+            self.qps,
+            self.disk_bytes,
+            self.disk_scope,
+            self.note,
+        )
+    }
+}
+
+fn run_paimon_ivfpq(
+    cfg: &Config,
+    dataset: &Dataset,
+    ids: &[i64],
+    workspace: &Path,
+) -> io::Result<()> {
+    let path = workspace.join("paimon_ivfpq.index");
+    let start = Instant::now();
+    let mut index = IVFPQIndex::new(cfg.d, cfg.nlist, cfg.pq_m, 
MetricType::L2, false);
+    index.train(&dataset.data, cfg.n);
+    index.add(&dataset.data, ids, cfg.n);
+    index.build_search_structures();
+    index.build_precomputed_table();
+    write_to_file(&path, |writer| write_index(&index, writer))?;
+    let build = start.elapsed();
+    drop(index);
+
+    let start = Instant::now();
+    let file = fs::File::open(&path)?;
+    let mut reader = IVFPQIndexReader::open(file)?;
+    reader.ensure_loaded()?;
+    let read = start.elapsed();
+
+    let first_query =
+        time_first_query(|| reader.search(&dataset.queries[..cfg.d], cfg.k, 
cfg.nprobe))?;
+    let search = time_search(|| {
+        search_batch_reader(&mut reader, &dataset.queries, cfg.nq, cfg.k, 
cfg.nprobe).map(|_| ())
+    })?;
+
+    print_row(
+        cfg,
+        "paimon",
+        "IVF_PQ",
+        None,
+        build,
+        read,
+        first_query,
+        search,
+        path.metadata()?.len(),
+        "index_bytes",
+        "",
+    );
+    Ok(())
+}
+
+fn run_paimon_ivfhnswflat(
+    cfg: &Config,
+    dataset: &Dataset,
+    ids: &[i64],
+    workspace: &Path,
+) -> io::Result<()> {
+    let path = workspace.join("paimon_ivfhnswflat.index");
+    let start = Instant::now();
+    let mut index = IVFHNSWFlatIndex::new(cfg.d, cfg.nlist, MetricType::L2, 
cfg.hnsw_params());
+    index.train(&dataset.data, cfg.n);
+    index.add(&dataset.data, ids, cfg.n);
+    index.build_graphs()?;
+    write_to_file(&path, |writer| write_ivfhnswflat_index(&index, writer))?;
+    let build = start.elapsed();
+    drop(index);
+
+    let start = Instant::now();
+    let file = fs::File::open(&path)?;
+    let mut reader = IVFHNSWFlatIndexReader::open(file)?;
+    reader.ensure_loaded()?;
+    let read = start.elapsed();
+
+    let first_query = time_first_query(|| {
+        reader.search(
+            &dataset.queries[..cfg.d],
+            cfg.k,
+            cfg.nprobe,
+            cfg.hnsw_ef_search,
+        )
+    })?;
+    let search = time_search(|| {
+        search_batch_ivfhnswflat_reader(
+            &mut reader,
+            &dataset.queries,
+            cfg.nq,
+            cfg.k,
+            cfg.nprobe,
+            cfg.hnsw_ef_search,
+        )
+        .map(|_| ())
+    })?;
+
+    print_row(
+        cfg,
+        "paimon",
+        "IVF_HNSW_FLAT",
+        Some(cfg.hnsw_ef_search),
+        build,
+        read,
+        first_query,
+        search,
+        path.metadata()?.len(),
+        "index_bytes",
+        "",
+    );
+    Ok(())
+}
+
+fn run_paimon_ivfhnswsq(
+    cfg: &Config,
+    dataset: &Dataset,
+    ids: &[i64],
+    workspace: &Path,
+) -> io::Result<()> {
+    let path = workspace.join("paimon_ivfhnswsq.index");
+    let start = Instant::now();
+    let mut index = IVFHNSWSQIndex::new(cfg.d, cfg.nlist, MetricType::L2, 
cfg.hnsw_params());
+    index.train(&dataset.data, cfg.n);
+    index.add(&dataset.data, ids, cfg.n);
+    index.build_graphs()?;
+    write_to_file(&path, |writer| write_ivfhnswsq_index(&index, writer))?;
+    let build = start.elapsed();
+    drop(index);
+
+    let start = Instant::now();
+    let file = fs::File::open(&path)?;
+    let mut reader = IVFHNSWSQIndexReader::open(file)?;
+    reader.ensure_loaded()?;
+    let read = start.elapsed();
+
+    let first_query = time_first_query(|| {
+        reader.search(
+            &dataset.queries[..cfg.d],
+            cfg.k,
+            cfg.nprobe,
+            cfg.hnsw_ef_search,
+        )
+    })?;
+    let search = time_search(|| {
+        search_batch_ivfhnswsq_reader(
+            &mut reader,
+            &dataset.queries,
+            cfg.nq,
+            cfg.k,
+            cfg.nprobe,
+            cfg.hnsw_ef_search,
+        )
+        .map(|_| ())
+    })?;
+
+    print_row(
+        cfg,
+        "paimon",
+        "IVF_HNSW_SQ",
+        Some(cfg.hnsw_ef_search),
+        build,
+        read,
+        first_query,
+        search,
+        path.metadata()?.len(),
+        "index_bytes",
+        "",
+    );
+    Ok(())
+}
+
+fn write_to_file(
+    path: &Path,
+    write: impl FnOnce(&mut PosWriter<&mut fs::File>) -> io::Result<()>,
+) -> io::Result<()> {
+    let mut file = fs::File::create(path)?;
+    let mut writer = PosWriter::new(&mut file);
+    write(&mut writer)
+}
+
+fn time_first_query<T>(query: impl FnOnce() -> io::Result<T>) -> 
io::Result<Duration> {
+    let start = Instant::now();
+    query()?;
+    Ok(start.elapsed())
+}
+
+fn time_search(search: impl FnOnce() -> io::Result<()>) -> 
io::Result<Duration> {
+    let start = Instant::now();
+    search()?;
+    Ok(start.elapsed())
+}
+
+#[allow(clippy::too_many_arguments)]
+fn print_row(
+    cfg: &Config,
+    engine: &'static str,
+    index: &'static str,
+    ef_search: Option<usize>,
+    build: Duration,
+    read: Duration,
+    first_query: Duration,
+    search: Duration,
+    disk_bytes: u64,
+    disk_scope: &'static str,
+    note: &'static str,
+) {
+    println!(
+        "{}",
+        CsvRow {
+            engine,
+            index,
+            n: cfg.n,
+            nq: cfg.nq,
+            d: cfg.d,
+            k: cfg.k,
+            nlist: cfg.nlist,
+            nprobe: cfg.nprobe,
+            ef_search,
+            build_ms: build.as_millis(),
+            read_ms: read.as_millis(),
+            first_query_ms: first_query.as_millis(),
+            search_ms: search.as_millis(),
+            qps: cfg.nq as f64 / search.as_secs_f64(),
+            disk_bytes,
+            disk_scope,
+            note,
+        }
+    );
+}
+
+fn read_env<T>(name: &str, default: T) -> Result<T, Box<dyn std::error::Error>>
+where
+    T: std::str::FromStr,
+    T::Err: std::error::Error + 'static,
+{
+    match env::var(name) {
+        Ok(value) => Ok(value.parse()?),
+        Err(env::VarError::NotPresent) => Ok(default),
+        Err(err) => Err(Box::new(err)),
+    }
+}
+
+fn prepare_workspace(output_dir: &Path) -> io::Result<PathBuf> {
+    let workspace = output_dir.join(format!("{}", std::process::id()));
+    if workspace.exists() {
+        fs::remove_dir_all(&workspace)?;
+    }
+    fs::create_dir_all(&workspace)?;
+    Ok(workspace)
+}

Reply via email to