This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 2f06fc96f0 [tantivy] Fix unnecessary .store file generation in
full-text index (#7670)
2f06fc96f0 is described below
commit 2f06fc96f0803f5892e4db0d15797b7be67075b9
Author: ChengHui Chen <[email protected]>
AuthorDate: Sat May 23 22:50:52 2026 +0800
[tantivy] Fix unnecessary .store file generation in full-text index (#7670)
Tantivy is used purely as an inverted index in Paimon, so `.store` files
(raw field values) are never needed or read. The original implementation
mistakenly marked `row_id` as stored (`stored=True`), wasting 30% or more
of the archive size of each index file.
This PR removes `.set_stored()` from the schema, filters out `.store`
files when packing the archive, and updates the Python reader to fetch
`row_id` from fast-field values instead of from stored documents.
---
.../tantivy/tantivy_full_text_global_index_reader.py | 13 +++++++++----
paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs | 14 ++++++++------
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git
a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
index a95cd6a850..e9d2150cf7 100644
---
a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
+++
b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
@@ -151,12 +151,17 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
searcher = self._searcher
query = self._index.parse_query(query_text, ["text"])
+
results = searcher.search(query, limit)
+ if not results.hits:
+ return DictBasedScoredIndexResult({})
+
+ doc_addresses = [addr for score, addr in results.hits]
+ scores = [score for score, addr in results.hits]
+ row_ids = searcher.fast_field_values("row_id", doc_addresses)
id_to_scores: Dict[int, float] = {}
- for score, doc_address in results.hits:
- doc = searcher.doc(doc_address)
- row_id = doc["row_id"][0]
+ for row_id, score in zip(row_ids, scores):
id_to_scores[row_id] = score
return DictBasedScoredIndexResult(id_to_scores)
@@ -179,7 +184,7 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
# Open tantivy index from stream-backed directory
schema_builder = tantivy.SchemaBuilder()
- schema_builder.add_unsigned_field("row_id", stored=True,
indexed=True, fast=True)
+ schema_builder.add_unsigned_field("row_id", stored=False,
indexed=True, fast=True)
schema_builder.add_text_field("text", stored=False)
schema = schema_builder.build()
diff --git a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
index f768a0d8a6..aec47eaa85 100644
--- a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
+++ b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
@@ -23,7 +23,7 @@ use jni::JNIEnv;
use std::ptr;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
-use tantivy::schema::{Field, NumericOptions, Schema, TEXT};
+use tantivy::schema::{Field, IndexRecordOption, NumericOptions, Schema,
TextFieldIndexing, TextOptions};
use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy};
use crate::jni_directory::JniDirectory;
@@ -56,12 +56,14 @@ fn build_schema() -> (Schema, Field, Field) {
let mut builder = Schema::builder();
let row_id_field = builder.add_u64_field(
"row_id",
- NumericOptions::default()
- .set_stored()
- .set_indexed()
- .set_fast(),
+ NumericOptions::default().set_indexed().set_fast(),
);
- let text_field = builder.add_text_field("text", TEXT);
+ let text_options = TextOptions::default().set_indexing_options(
+ TextFieldIndexing::default()
+ .set_tokenizer("default")
+ .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+ );
+ let text_field = builder.add_text_field("text", text_options);
(builder.build(), row_id_field, text_field)
}