This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 2f06fc96f0 [tantivy] Fix unnecessary .store file generation in 
full-text index (#7670)
2f06fc96f0 is described below

commit 2f06fc96f0803f5892e4db0d15797b7be67075b9
Author: ChengHui Chen <[email protected]>
AuthorDate: Sat May 23 22:50:52 2026 +0800

    [tantivy] Fix unnecessary .store file generation in full-text index (#7670)
    
    Paimon uses Tantivy purely as an inverted index, so `.store` files
    (which hold raw field values) are never needed or read. The original
    implementation mistakenly marked `row_id` as stored (`stored=True`),
    wasting 30% or more of the archive size of each index file.
    
    This PR removes `.set_stored()` from the schema, filters `.store`
    files when packing the archive, and updates the Python reader to
    fetch `row_id` from the fast field instead of the stored document.
---
 .../tantivy/tantivy_full_text_global_index_reader.py       | 13 +++++++++----
 paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs          | 14 ++++++++------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git 
a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
 
b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
index a95cd6a850..e9d2150cf7 100644
--- 
a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
+++ 
b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
@@ -151,12 +151,17 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
 
         searcher = self._searcher
         query = self._index.parse_query(query_text, ["text"])
+
         results = searcher.search(query, limit)
+        if not results.hits:
+            return DictBasedScoredIndexResult({})
+
+        doc_addresses = [addr for score, addr in results.hits]
+        scores = [score for score, addr in results.hits]
+        row_ids = searcher.fast_field_values("row_id", doc_addresses)
 
         id_to_scores: Dict[int, float] = {}
-        for score, doc_address in results.hits:
-            doc = searcher.doc(doc_address)
-            row_id = doc["row_id"][0]
+        for row_id, score in zip(row_ids, scores):
             id_to_scores[row_id] = score
 
         return DictBasedScoredIndexResult(id_to_scores)
@@ -179,7 +184,7 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
 
             # Open tantivy index from stream-backed directory
             schema_builder = tantivy.SchemaBuilder()
-            schema_builder.add_unsigned_field("row_id", stored=True, 
indexed=True, fast=True)
+            schema_builder.add_unsigned_field("row_id", stored=False, 
indexed=True, fast=True)
             schema_builder.add_text_field("text", stored=False)
             schema = schema_builder.build()
 
diff --git a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs 
b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
index f768a0d8a6..aec47eaa85 100644
--- a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
+++ b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
@@ -23,7 +23,7 @@ use jni::JNIEnv;
 use std::ptr;
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
-use tantivy::schema::{Field, NumericOptions, Schema, TEXT};
+use tantivy::schema::{Field, IndexRecordOption, NumericOptions, Schema, 
TextFieldIndexing, TextOptions};
 use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy};
 
 use crate::jni_directory::JniDirectory;
@@ -56,12 +56,14 @@ fn build_schema() -> (Schema, Field, Field) {
     let mut builder = Schema::builder();
     let row_id_field = builder.add_u64_field(
         "row_id",
-        NumericOptions::default()
-            .set_stored()
-            .set_indexed()
-            .set_fast(),
+        NumericOptions::default().set_indexed().set_fast(),
     );
-    let text_field = builder.add_text_field("text", TEXT);
+    let text_options = TextOptions::default().set_indexing_options(
+        TextFieldIndexing::default()
+            .set_tokenizer("default")
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+    );
+    let text_field = builder.add_text_field("text", text_options);
     (builder.build(), row_id_field, text_field)
 }
 

Reply via email to