klboke opened a new issue, #6962:
URL: https://github.com/apache/paimon/issues/6962

   ### Search before asking
   
   - [x] I searched in the [issues](https://github.com/apache/paimon/issues) 
and found nothing similar.
   
   
   ### Paimon version
   
   1.3
   
   ### Compute Engine
   
   spark
   
   ### Minimal reproduce step
   
   ```python
   # reproduce_paimon_error.py
   import configparser
   import os
   import traceback
   import logging
   
   from pypaimon.catalog.catalog_context import CatalogContext
   from pypaimon.catalog.rest.rest_catalog import RESTCatalog
   from pypaimon.api.options import Options
   
   # --- Reuse the config-loading and logging logic from paimon_dataset_v3.py ---

   # 1. Set up a simple logger
   logger = logging.getLogger("PaimonTest")
   if not logger.handlers:
       # Only attach a stream handler when the logger has none yet.
       handler = logging.StreamHandler()
       # The format string must stay on one logical line: a raw line break
       # inside the quotes is an unterminated string literal.
       fmt = logging.Formatter(
           "[%(asctime)s] %(levelname)s:%(name)s: %(message)s")
       handler.setFormatter(fmt)
       logger.addHandler(handler)
   logger.setLevel(logging.INFO)
   
   
   # 2. Reuse the config-loading function so the connection settings stay consistent
   def _load_paimon_catalog_config() -> dict:
       """Load the Paimon catalog config from the config file and the environment.

       The config file is read first (``PAIMON_CONFIG_PATH``, falling back to
       ``$HOME/.paimon_config.ini``); non-empty environment variables then
       override the file values.

       Returns:
           A dict keyed by catalog option name; options that were not provided
           anywhere keep the value ``None`` (``metastore`` defaults to ``rest``).
       """
       # Every supported option starts unset, except the metastore type.
       option_keys = (
           'metastore',  # rest, hive, filesystem
           'uri',
           'warehouse',
           'dlf.region',
           'token.provider',
           'dlf.access-key-id',
           'dlf.access-key-secret',
       )
       config = dict.fromkeys(option_keys)
       config['metastore'] = 'rest'

       # Read from the config file
       default_path = os.getenv("HOME", "/home/admin") + "/.paimon_config.ini"
       config_path = os.getenv('PAIMON_CONFIG_PATH', default_path)

       if not os.path.exists(config_path):
           logger.info("Paimon config file not found: %s", config_path)
       else:
           try:
               ini = configparser.ConfigParser()
               ini.read(config_path)
               # File keys use the same names as the option keys.
               for key in option_keys:
                   if ini.has_option('DEFAULT', key):
                       config[key] = str(ini.get('DEFAULT', key))
           except Exception as err:
               # Best effort: a broken config file is reported, not fatal.
               logger.warning(
                   "Failed to load Paimon config from %s: %s",
                   config_path, err)

       # Environment variables override the file values
       env_to_option = {
           'PAIMON_METASTORE': 'metastore',
           'PAIMON_URI': 'uri',
           'PAIMON_WAREHOUSE': 'warehouse',
           'DLF_REGION': 'dlf.region',
           'PAIMON_TOKEN_PROVIDER': 'token.provider',
           'DLF_ACCESS_KEY_ID': 'dlf.access-key-id',
           'DLF_ACCESS_KEY_SECRET': 'dlf.access-key-secret',
       }
       for env_name, key in env_to_option.items():
           value = os.getenv(env_name)
           # Empty environment values are ignored, like unset ones.
           if value:
               config[key] = str(value)

       return config
   
   # --- Core reproduction logic ---

   def main():
       """Reproduce the IndexError by calling the pypaimon API directly.

       Loads the catalog config, builds a ``RESTCatalog``, fetches the table
       and calls ``scan.plan()``. An ``IndexError`` is logged as a successful
       reproduction; any other exception is logged as unexpected. Both paths
       print the traceback and return normally.
       """
       # --- Configuration, modelled on paimon_dataset_test_v3.py ---
       os.environ['PAIMON_CONFIG_PATH'] = (
           '/Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini')
       TABLE_TO_TEST = "adn.wide_table_200cols/dt=2025-09-01"
       # ----------------------------------------------

       logger.info(f"开始测试,目标表: {TABLE_TO_TEST}")
       logger.info(f"使用配置文件: {os.environ.get('PAIMON_CONFIG_PATH')}")
       logger.info("第一步: 加载 Paimon catalog 配置...")

       try:
           # 1. Load the config and create the catalog
           cfg = _load_paimon_catalog_config()
           # Drop options that were never set.
           options = {k: v for k, v in cfg.items() if v is not None}

           catalog = RESTCatalog(
               CatalogContext.create_from_options(Options(options)))
           logger.info("Catalog 创建成功. uri=%s", cfg.get('uri'))

           # 2. Get the table object
           logger.info("第二步: 获取表对象...")
           table_name_full = TABLE_TO_TEST
           # Keep only the part before the first '/'; split() returns the
           # whole string unchanged when there is no '/'.
           table_name = table_name_full.split('/', 1)[0]
           logger.info(f"解析表名: '{table_name_full}' -> '{table_name}'")
           paimon_table = catalog.get_table(table_name)
           logger.info(f"表 '{table_name}' 对象获取成功。")

           # 3. Create the scan and plan it
           # This is the step that triggers the error, because it reads the
           # manifest files.
           logger.info("第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...")
           scan = paimon_table.new_read_builder().new_scan()

           # scan.plan() is where the error is raised
           scan_plan = scan.plan()

           # Reaching this point means the error was not reproduced
           splits = scan_plan.splits()
           logger.info(f"操作成功完成,未发生错误。共找到 {len(splits)} 个 split。")

       except IndexError:
           logger.error("成功复现 'IndexError: index out of range'!")
           logger.error("这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。")
           traceback.print_exc()
       except Exception as e:
           logger.error(f"发生了预料之外的错误: {e}")
           traceback.print_exc()


   if __name__ == "__main__":
       main()
   
   ```
   
   ### What doesn't meet your expectations?
   
   ```shell
   [2026-01-06 18:42:07,302] INFO:PaimonTest: 开始测试,目标表: 
adn.wide_table_200cols/dt=2025-09-01
   [2026-01-06 18:42:07,302] INFO:PaimonTest: 使用配置文件: 
/Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini
   [2026-01-06 18:42:07,302] INFO:PaimonTest: 第一步: 加载 Paimon catalog 配置...
   [2026-01-06 18:42:07,384] INFO:PaimonTest: Catalog 创建成功. 
uri=https://cn-shanghai-vpc.dlf.aliyuncs.com
   [2026-01-06 18:42:07,386] INFO:PaimonTest: 第二步: 获取表对象...
   [2026-01-06 18:42:07,386] INFO:PaimonTest: 解析表名: 
'adn.wide_table_200cols/dt=2025-09-01' -> 'adn.wide_table_200cols'
   [2026-01-06 18:42:09,495] INFO:PaimonTest: 表 'adn.wide_table_200cols' 对象获取成功。
   [2026-01-06 18:42:09,496] INFO:PaimonTest: 第三步: 创建扫描并执行 plan() 操作(这将触发 
Manifest 读取)...
   [2026-01-06 18:42:09,934] ERROR:PaimonTest: 成功复现 'IndexError: index out of 
range'!
   [2026-01-06 18:42:09,934] ERROR:PaimonTest: 这确认了问题发生在 pypaimon 读取 Paimon 
表元数据(Manifest)的底层过程中。
   Traceback (most recent call last):
     File "/Users/kl/PycharmProjects/tapio/reproduce_paimon_error.py", line 
115, in main
       scan_plan = scan.plan()
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/table_scan.py",
 line 45, in plan
       return self.starting_scanner.scan()
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py",
 line 77, in scan
       file_entries = self.plan_files()
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py",
 line 95, in plan_files
       return self.read_manifest_entries(manifest_files)
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py",
 line 104, in read_manifest_entries
       max_workers=max_workers)
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py",
 line 57, in read_entries_parallel
       for entries in future_results:
     File 
"/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", 
line 586, in result_iterator
       yield fs.pop().result()
     File 
"/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", 
line 432, in result
       return self.__get_result()
     File 
"/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", 
line 384, in __get_result
       raise self._exception
     File 
"/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/thread.py", 
line 56, in run
       result = self.fn(*self.args, **self.kwargs)
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py",
 line 51, in _process_single_manifest
       return self.read(manifest_file.file_name, manifest_entry_filter, 
drop_stats)
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py",
 line 84, in read
       min_values=BinaryRow(key_dict['_MIN_VALUES'], 
self.trimmed_primary_keys_fields),
     File 
"/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/table/row/binary_row.py",
 line 38, in __init__
       self.row_kind = RowKind(self.actual_data[0])
   IndexError: index out of range
   ```
   
   ### Anything else?
   
   -  Python: 3.6.15
   - pyarrow: 6.0.1
   - pypaimon: 1.3.1 
   
    It's worth noting that this issue does not occur with the development
version `pypaimon==0.3.dev`. I have tested the same code against the `0.3.dev`
version, and it runs correctly without raising an `IndexError`.
   
   ### Are you willing to submit a PR?
   
   - [ ] I'm willing to submit a PR!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to