This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 3721ae0f88 [python] Fix tantivy full-text index schema mismatch (#8113)
3721ae0f88 is described below
commit 3721ae0f88c4b739566215f0af7db96238d1a620
Author: XiaoHongbo <[email protected]>
AuthorDate: Thu Jun 4 14:30:11 2026 +0800
[python] Fix tantivy full-text index schema mismatch (#8113)
---
.../tantivy_full_text_global_index_reader.py | 43 +++++++++++----
.../pypaimon/tests/vector_search_filter_test.py | 63 ++++++++++++++++++++--
2 files changed, 92 insertions(+), 14 deletions(-)
diff --git a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
index daf1022d95..5ed0b9a91c 100644
--- a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
+++ b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py
@@ -22,6 +22,7 @@ backed by a stream-based Directory. No temp files are created on disk.
"""
import json
+import logging
import os
import struct
import threading
@@ -34,6 +35,8 @@ from pypaimon.globalindex.vector_search_result import (
)
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
+logger = logging.getLogger(__name__)
+
TANTIVY_FULLTEXT_IDENTIFIER = "tantivy-fulltext"
TANTIVY_NGRAM_TOKENIZER = "paimon_ngram"
TANTIVY_JIEBA_TOKENIZER = "paimon_jieba"
@@ -310,13 +313,25 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
file_names, file_offsets, file_lengths =
self._parse_archive_header(stream)
directory = StreamDirectory(stream, file_names, file_offsets,
file_lengths)
- schema_builder = tantivy.SchemaBuilder()
- schema_builder.add_unsigned_field("row_id", stored=False,
indexed=True, fast=True)
- self._add_text_field(schema_builder)
- schema = schema_builder.build()
-
+ schema = self._build_schema(tantivy)
+ try:
+ self._index = tantivy.Index(
+ schema, directory=directory,
+ )
+ except ValueError as e:
+ if "schema does not match" not in str(e):
+ raise
+ logger.warning(
+ "Schema mismatch, retrying with "
+ "row_id stored=true"
+ )
+ schema = self._build_schema(
+ tantivy, row_id_stored=True,
+ )
+ self._index = tantivy.Index(
+ schema, directory=directory,
+ )
self._schema = schema
- self._index = tantivy.Index(schema, directory=directory)
self._register_tokenizer(tantivy, self._index)
self._index.reload()
self._searcher = self._index.searcher()
@@ -325,17 +340,25 @@ class TantivyFullTextGlobalIndexReader(GlobalIndexReader):
stream.close()
raise
- def _add_text_field(self, schema_builder):
+ def _build_schema(self, tantivy, row_id_stored=False):
+ schema_builder = tantivy.SchemaBuilder()
+ schema_builder.add_unsigned_field(
+ "row_id", stored=row_id_stored, indexed=True, fast=True,
+ )
tokenizer_name = self._index_options.tokenizer_name()
field_kwargs = {}
if not self._index_options.with_position:
field_kwargs["index_option"] = "freq"
if tokenizer_name == "default":
- schema_builder.add_text_field("text", stored=False, **field_kwargs)
+ schema_builder.add_text_field(
+ "text", stored=False, **field_kwargs,
+ )
else:
schema_builder.add_text_field(
- "text", stored=False, tokenizer_name=tokenizer_name,
- **field_kwargs)
+ "text", stored=False,
+ tokenizer_name=tokenizer_name, **field_kwargs,
+ )
+ return schema_builder.build()
def _register_tokenizer(self, tantivy, index):
if (self._index_options.tokenizer == "default"
diff --git a/paimon-python/pypaimon/tests/vector_search_filter_test.py b/paimon-python/pypaimon/tests/vector_search_filter_test.py
index e5b48282cb..2932ea8dac 100644
--- a/paimon-python/pypaimon/tests/vector_search_filter_test.py
+++ b/paimon-python/pypaimon/tests/vector_search_filter_test.py
@@ -177,7 +177,7 @@ class _FakeSchemaBuilder:
self.fields = {}
def add_unsigned_field(self, name, stored=False, indexed=True, fast=False):
- self.fields[name] = {"fast": fast}
+ self.fields[name] = {"fast": fast, "stored": stored}
def add_text_field(self, name, stored=False, tokenizer_name=None,
**kwargs):
if "index_option" in kwargs and kwargs["index_option"] is None:
@@ -531,7 +531,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
else:
sys.modules["tantivy"] = old_tantivy
- self.assertEqual({"row_id": {"fast": True},
+ self.assertEqual({"row_id": {"fast": True, "stored": False},
"text": {"stored": False,
"tokenizer_name": TANTIVY_NGRAM_TOKENIZER}},
tantivy.last_schema.fields)
@@ -544,6 +544,61 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
query = tantivy.last_index.searcher_instance.query
self.assertEqual(("中文", ("text",), {}), query)
+ def test_schema_fallback_for_pre_7670_indexes(self):
+ from pypaimon.globalindex.full_text_search import FullTextSearch
+ from
pypaimon.globalindex.tantivy.tantivy_full_text_global_index_reader import (
+ TantivyFullTextGlobalIndexReader,
+ )
+
+ call_count = [0]
+
+ class _FakeTantivyWithSchemaFallback(_FakeTantivy):
+ def __init__(self_outer):
+ super().__init__()
+ parent = self_outer
+
+ class SchemaBuilder(_FakeSchemaBuilder):
+ def build(self_inner):
+ parent.last_schema = super().build()
+ return parent.last_schema
+
+ class Index(_FakeIndex):
+ def __init__(self_inner, schema, directory=None):
+ call_count[0] += 1
+ row_id_opts = schema.fields.get("row_id", {})
+ if not row_id_opts.get("stored", False):
+ raise ValueError(
+ "Schema error: 'An index exists but "
+ "the schema does not match.'"
+ )
+ super().__init__(schema, directory=directory)
+ parent.last_index = self_inner
+
+ self_outer.SchemaBuilder = SchemaBuilder
+ self_outer.Index = Index
+
+ tantivy = _FakeTantivyWithSchemaFallback()
+ old_tantivy = sys.modules.get("tantivy")
+ sys.modules["tantivy"] = tantivy
+ try:
+ reader = TantivyFullTextGlobalIndexReader(
+ _FakeFileIO(), "/unused",
+ [GlobalIndexIOMeta(file_name="ft.index", file_size=1)])
+ try:
+ reader.visit_full_text_search(
+ FullTextSearch("hello", 5, "content")).result()
+ finally:
+ reader.close()
+ finally:
+ if old_tantivy is None:
+ sys.modules.pop("tantivy", None)
+ else:
+ sys.modules["tantivy"] = old_tantivy
+
+ self.assertEqual(2, call_count[0])
+ self.assertTrue(
+ tantivy.last_schema.fields["row_id"].get("stored", False))
+
def test_custom_analyzer_reader_registers_matching_tantivy_analyzer(self):
from pypaimon.globalindex.full_text_search import FullTextSearch
from
pypaimon.globalindex.tantivy.tantivy_full_text_global_index_reader import (
@@ -577,7 +632,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
else:
sys.modules["tantivy"] = old_tantivy
- self.assertEqual({"row_id": {"fast": True},
+ self.assertEqual({"row_id": {"fast": True, "stored": False},
"text": {"stored": False,
"tokenizer_name": TANTIVY_CUSTOM_TOKENIZER,
"index_option": "freq"}},
@@ -656,7 +711,7 @@ class TantivyFullTextIndexOptionsTest(unittest.TestCase):
else:
sys.modules["jieba"] = old_jieba
- self.assertEqual({"row_id": {"fast": True},
+ self.assertEqual({"row_id": {"fast": True, "stored": False},
"text": {"stored": False,
"tokenizer_name": TANTIVY_JIEBA_TOKENIZER}},
tantivy.last_schema.fields)