(paimon) branch master updated: [python] Fix tantivy full-text index schema mismatch (#8113)

lzljs3620320 Wed, 03 Jun 2026 23:30:30 -0700

This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git



The following commit(s) were added to refs/heads/master by this push:
     new 3721ae0f88 [python] Fix tantivy full-text index schema mismatch (#8113)
3721ae0f88 is described below

commit 3721ae0f88c4b739566215f0af7db96238d1a620
Author: XiaoHongbo <[email protected]>
AuthorDate: Thu Jun 4 14:30:11 2026 +0800

    [python] Fix tantivy full-text index schema mismatch  (#8113)
---
 .../tantivy_full_text_global_index_reader.py       | 43 +++++++++++----
 .../pypaimon/tests/vector_search_filter_test.py    | 63 ++++++++++++++++++++--
 2 files changed, 92 insertions(+), 14 deletions(-)

diff --git a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
index daf1022d95..5ed0b9a91c 100644
--- a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
+++ b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
@@ -22,6 +22,7 @@ backed by a stream-based Directory. No temp files are created on disk.
 """
 
 import json
+import logging
 import os
 import struct
 import threading
@@ -34,6 +35,8 @@ from pypaimon.globalindex.vector_search_result import (
 )
 from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
 
+logger = logging.getLogger(__name__)
+
 TANTIVY_FULLTEXT_IDENTIFIER = "tantivy-fulltext"
 TANTIVY_NGRAM_TOKENIZER = "paimon_ngram"
 TANTIVY_JIEBA_TOKENIZER = "paimon_jieba"
@@ -310,13 +313,25 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
                 file_names, file_offsets, file_lengths = 
self._parse_archive_header(stream)
                 directory = StreamDirectory(stream, file_names, file_offsets, 
file_lengths)
 
-                schema_builder = tantivy.SchemaBuilder()
-                schema_builder.add_unsigned_field("row_id", stored=False, 
indexed=True, fast=True)
-                self._add_text_field(schema_builder)
-                schema = schema_builder.build()
-
+                schema = self._build_schema(tantivy)
+                try:
+                    self._index = tantivy.Index(
+                        schema, directory=directory,
+                    )
+                except ValueError as e:
+                    if "schema does not match" not in str(e):
+                        raise
+                    logger.warning(
+                        "Schema mismatch, retrying with "
+                        "row_id stored=true"
+                    )
+                    schema = self._build_schema(
+                        tantivy, row_id_stored=True,
+                    )
+                    self._index = tantivy.Index(
+                        schema, directory=directory,
+                    )
                 self._schema = schema
-                self._index = tantivy.Index(schema, directory=directory)
                 self._register_tokenizer(tantivy, self._index)
                 self._index.reload()
                 self._searcher = self._index.searcher()
@@ -325,17 +340,25 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
                 stream.close()
                 raise
 
-    def _add_text_field(self, schema_builder):
+    def _build_schema(self, tantivy, row_id_stored=False):
+        schema_builder = tantivy.SchemaBuilder()
+        schema_builder.add_unsigned_field(
+            "row_id", stored=row_id_stored, indexed=True, fast=True,
+        )
         tokenizer_name = self._index_options.tokenizer_name()
         field_kwargs = {}
         if not self._index_options.with_position:
             field_kwargs["index_option"] = "freq"
         if tokenizer_name == "default":
-            schema_builder.add_text_field("text", stored=False, **field_kwargs)
+            schema_builder.add_text_field(
+                "text", stored=False, **field_kwargs,
+            )
         else:
             schema_builder.add_text_field(
-                "text", stored=False, tokenizer_name=tokenizer_name,
-                **field_kwargs)
+                "text", stored=False,
+                tokenizer_name=tokenizer_name, **field_kwargs,
+            )
+        return schema_builder.build()
 
     def _register_tokenizer(self, tantivy, index):
         if (self._index_options.tokenizer == "default"
diff --git a/paimon-python/pypaimon/tests/vector_search_filter_test.py b/paimon-python/pypaimon/tests/vector_search_filter_test.py
index e5b48282cb..2932ea8dac 100644
--- a/paimon-python/pypaimon/tests/vector_search_filter_test.py
+++ b/paimon-python/pypaimon/tests/vector_search_filter_test.py
@@ -177,7 +177,7 @@ class _FakeSchemaBuilder:
         self.fields = {}
 
     def add_unsigned_field(self, name, stored=False, indexed=True, fast=False):
-        self.fields[name] = {"fast": fast}
+        self.fields[name] = {"fast": fast, "stored": stored}
 
     def add_text_field(self, name, stored=False, tokenizer_name=None, 
**kwargs):
         if "index_option" in kwargs and kwargs["index_option"] is None:
@@ -531,7 +531,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
             else:
                 sys.modules["tantivy"] = old_tantivy
 
-        self.assertEqual({"row_id": {"fast": True},
+        self.assertEqual({"row_id": {"fast": True, "stored": False},
                           "text": {"stored": False,
                                    "tokenizer_name": TANTIVY_NGRAM_TOKENIZER}},
                          tantivy.last_schema.fields)
@@ -544,6 +544,61 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
         query = tantivy.last_index.searcher_instance.query
         self.assertEqual(("中文", ("text",), {}), query)
 
+    def test_schema_fallback_for_pre_7670_indexes(self):
+        from pypaimon.globalindex.full_text_search import FullTextSearch
+        from 
pypaimon.globalindex.tantivy.tantivy_full_text_global_index_reader import (
+            TantivyFullTextGlobalIndexReader,
+        )
+
+        call_count = [0]
+
+        class _FakeTantivyWithSchemaFallback(_FakeTantivy):
+            def __init__(self_outer):
+                super().__init__()
+                parent = self_outer
+
+                class SchemaBuilder(_FakeSchemaBuilder):
+                    def build(self_inner):
+                        parent.last_schema = super().build()
+                        return parent.last_schema
+
+                class Index(_FakeIndex):
+                    def __init__(self_inner, schema, directory=None):
+                        call_count[0] += 1
+                        row_id_opts = schema.fields.get("row_id", {})
+                        if not row_id_opts.get("stored", False):
+                            raise ValueError(
+                                "Schema error: 'An index exists but "
+                                "the schema does not match.'"
+                            )
+                        super().__init__(schema, directory=directory)
+                        parent.last_index = self_inner
+
+                self_outer.SchemaBuilder = SchemaBuilder
+                self_outer.Index = Index
+
+        tantivy = _FakeTantivyWithSchemaFallback()
+        old_tantivy = sys.modules.get("tantivy")
+        sys.modules["tantivy"] = tantivy
+        try:
+            reader = TantivyFullTextGlobalIndexReader(
+                _FakeFileIO(), "/unused",
+                [GlobalIndexIOMeta(file_name="ft.index", file_size=1)])
+            try:
+                reader.visit_full_text_search(
+                    FullTextSearch("hello", 5, "content")).result()
+            finally:
+                reader.close()
+        finally:
+            if old_tantivy is None:
+                sys.modules.pop("tantivy", None)
+            else:
+                sys.modules["tantivy"] = old_tantivy
+
+        self.assertEqual(2, call_count[0])
+        self.assertTrue(
+            tantivy.last_schema.fields["row_id"].get("stored", False))
+
     def test_custom_analyzer_reader_registers_matching_tantivy_analyzer(self):
         from pypaimon.globalindex.full_text_search import FullTextSearch
         from 
pypaimon.globalindex.tantivy.tantivy_full_text_global_index_reader import (
@@ -577,7 +632,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
             else:
                 sys.modules["tantivy"] = old_tantivy
 
-        self.assertEqual({"row_id": {"fast": True},
+        self.assertEqual({"row_id": {"fast": True, "stored": False},
                           "text": {"stored": False,
                                    "tokenizer_name": TANTIVY_CUSTOM_TOKENIZER,
                                    "index_option": "freq"}},
@@ -656,7 +711,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
             else:
                 sys.modules["jieba"] = old_jieba
 
-        self.assertEqual({"row_id": {"fast": True},
+        self.assertEqual({"row_id": {"fast": True, "stored": False},
                           "text": {"stored": False,
                                    "tokenizer_name": TANTIVY_JIEBA_TOKENIZER}},
                          tantivy.last_schema.fields)

(paimon) branch master updated: [python] Fix tantivy full-text index schema mismatch (#8113)

Reply via email to