klboke opened a new issue, #6962: URL: https://github.com/apache/paimon/issues/6962
### Search before asking - [x] I searched in the [issues](https://github.com/apache/paimon/issues) and found nothing similar. ### Paimon version 1.3 ### Compute Engine spark ### Minimal reproduce step ```python # reproduce_paimon_error.py import configparser import os import traceback import logging from pypaimon.catalog.catalog_context import CatalogContext from pypaimon.catalog.rest.rest_catalog import RESTCatalog from pypaimon.api.options import Options # --- 复用 paimon_dataset_v3.py 中的配置加载和日志逻辑 --- # 1. 设置一个简单的日志记录器 logger = logging.getLogger("PaimonTest") if not logger.handlers: handler = logging.StreamHandler() fmt = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s") handler.setFormatter(fmt) logger.addHandler(handler) logger.setLevel(logging.INFO) # 2. 复用配置加载函数,确保连接信息一致 def _load_paimon_catalog_config() -> dict: """从环境变量与配置文件加载 Paimon catalog 配置;环境变量优先。""" config = { 'metastore': 'rest', # rest, hive, filesystem 'uri': None, 'warehouse': None, 'dlf.region': None, 'token.provider': None, 'dlf.access-key-id': None, 'dlf.access-key-secret': None, } # 从配置文件读取 default_path = os.getenv("HOME", "/home/admin") + "/.paimon_config.ini" config_path = os.getenv('PAIMON_CONFIG_PATH', default_path) if os.path.exists(config_path): try: parser = configparser.ConfigParser() parser.read(config_path) mappings = { 'metastore': ('metastore', str), 'uri': ('uri', str), 'warehouse': ('warehouse', str), 'dlf.region': ('dlf.region', str), 'token.provider': ('token.provider', str), 'dlf.access-key-id': ('dlf.access-key-id', str), 'dlf.access-key-secret': ('dlf.access-key-secret', str), } for file_key, (cfg_key, cast) in mappings.items(): if parser.has_option('DEFAULT', file_key): config[cfg_key] = cast(parser.get('DEFAULT', file_key)) except Exception as e: logger.warning("Failed to load Paimon config from %s: %s", config_path, e) else: logger.info("Paimon config file not found: %s", config_path) # 环境变量覆盖 env_mappings = { 'PAIMON_METASTORE': ('metastore', str), 'PAIMON_URI': ('uri', str), 'PAIMON_WAREHOUSE': ('warehouse', str), 'DLF_REGION': ('dlf.region', str), 'PAIMON_TOKEN_PROVIDER': ('token.provider', str), 'DLF_ACCESS_KEY_ID': ('dlf.access-key-id', str), 'DLF_ACCESS_KEY_SECRET': ('dlf.access-key-secret', str), } for env_key, (cfg_key, cast) in env_mappings.items(): v = os.getenv(env_key) if v: config[cfg_key] = cast(v) return config # --- 核心复现逻辑 --- def main(): """ 直接调用 pypaimon API 复现 IndexError 的核心逻辑。 """ # --- 参考 paimon_dataset_test_v3.py 设置配置 --- os.environ['PAIMON_CONFIG_PATH'] = '/Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini' TABLE_TO_TEST = "adn.wide_table_200cols/dt=2025-09-01" # ---------------------------------------------- logger.info(f"开始测试,目标表: {TABLE_TO_TEST}") logger.info(f"使用配置文件: {os.environ.get('PAIMON_CONFIG_PATH')}") logger.info("第一步: 加载 Paimon catalog 配置...") try: # 1. 加载配置并创建 Catalog cfg = _load_paimon_catalog_config() options = {k: v for k, v in cfg.items() if v is not None} catalog = RESTCatalog(CatalogContext.create_from_options(Options(options))) logger.info("Catalog 创建成功. uri=%s", cfg.get('uri')) # 2. 获取表对象 logger.info("第二步: 获取表对象...") table_name_full = TABLE_TO_TEST table_name = table_name_full.split('/', 1)[0] if '/' in table_name_full else table_name_full logger.info(f"解析表名: '{table_name_full}' -> '{table_name}'") paimon_table = catalog.get_table(table_name) logger.info(f"表 '{table_name}' 对象获取成功。") # 3. 创建扫描器并规划 # 这是触发错误的步骤,因为它会去读取 Manifest 文件 logger.info("第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...") scan = paimon_table.new_read_builder().new_scan() # scan.plan() 是整个错误的触发点 scan_plan = scan.plan() # 如果代码能执行到这里,说明没有复现出错误 splits = scan_plan.splits() logger.info(f"操作成功完成,未发生错误。共找到 {len(splits)} 个 split。") except IndexError: logger.error("成功复现 'IndexError: index out of range'!") logger.error("这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。") traceback.print_exc() except Exception as e: logger.error(f"发生了预料之外的错误: {e}") traceback.print_exc() if __name__ == "__main__": main() ``` ### What doesn't meet your expectations? ```shell [2026-01-06 18:42:07,302] INFO:PaimonTest: 开始测试,目标表: adn.wide_table_200cols/dt=2025-09-01 [2026-01-06 18:42:07,302] INFO:PaimonTest: 使用配置文件: /Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini [2026-01-06 18:42:07,302] INFO:PaimonTest: 第一步: 加载 Paimon catalog 配置... [2026-01-06 18:42:07,384] INFO:PaimonTest: Catalog 创建成功. uri=https://cn-shanghai-vpc.dlf.aliyuncs.com [2026-01-06 18:42:07,386] INFO:PaimonTest: 第二步: 获取表对象... [2026-01-06 18:42:07,386] INFO:PaimonTest: 解析表名: 'adn.wide_table_200cols/dt=2025-09-01' -> 'adn.wide_table_200cols' [2026-01-06 18:42:09,495] INFO:PaimonTest: 表 'adn.wide_table_200cols' 对象获取成功。 [2026-01-06 18:42:09,496] INFO:PaimonTest: 第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)... [2026-01-06 18:42:09,934] ERROR:PaimonTest: 成功复现 'IndexError: index out of range'! [2026-01-06 18:42:09,934] ERROR:PaimonTest: 这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。 Traceback (most recent call last): File "/Users/kl/PycharmProjects/tapio/reproduce_paimon_error.py", line 115, in main scan_plan = scan.plan() File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/table_scan.py", line 45, in plan return self.starting_scanner.scan() File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 77, in scan file_entries = self.plan_files() File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 95, in plan_files return self.read_manifest_entries(manifest_files) File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 104, in read_manifest_entries max_workers=max_workers) File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 57, in read_entries_parallel for entries in future_results: File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 586, in result_iterator yield fs.pop().result() File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 432, in result return self.__get_result() File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result raise self._exception File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/thread.py", line 56, in run result = self.fn(*self.args, **self.kwargs) File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 51, in _process_single_manifest return self.read(manifest_file.file_name, manifest_entry_filter, drop_stats) File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 84, in read min_values=BinaryRow(key_dict['_MIN_VALUES'], self.trimmed_primary_keys_fields), File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/table/row/binary_row.py", line 38, in __init__ self.row_kind = RowKind(self.actual_data[0]) IndexError: index out of range ``` ### Anything else? - Python: 3.6.15 - pyarrow: 6.0.1 - pypaimon: 1.3.1 It's worth noting that this issue does not occur with the development version `pypaimon==0.3.dev`. I have tested the same code against the `0.3.dev` version, and it runs correctly without raising an `IndexError`. ### Are you willing to submit a PR? - [ ] I'm willing to submit a PR! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
