This is an automated email from the ASF dual-hosted git repository. hope pushed a commit to branch release-1.4 in repository https://gitbox.apache.org/repos/asf/paimon.git
commit a1f79399015eadd917a5b437e2aea482f656b8d4 Author: timmyyao <[email protected]> AuthorDate: Tue Mar 31 20:18:27 2026 +0800 [python] Add doc and refine config for using pyjindosdk in pypaimon (#7565) Use pyjindosdk as the default OSS implementation for pypaimon whenever pyjindosdk is installed. How to fall back to the legacy implementation is described in the doc. --- docs/content/pypaimon/pyjindosdk-support.md | 56 ++++++++++++++++++++++ paimon-python/pypaimon/common/options/config.py | 4 +- .../pypaimon/filesystem/pyarrow_file_io.py | 23 +++++++-- paimon-python/pypaimon/tests/file_io_test.py | 2 + 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/docs/content/pypaimon/pyjindosdk-support.md b/docs/content/pypaimon/pyjindosdk-support.md new file mode 100644 index 0000000000..7db012c928 --- /dev/null +++ b/docs/content/pypaimon/pyjindosdk-support.md @@ -0,0 +1,56 @@ +--- +title: "PyJindoSDK Support" +weight: 8 +type: docs +aliases: + - /pypaimon/pyjindosdk-support.html +--- + +# PyJindoSDK Support + +## Introduction + +[JindoSDK](https://github.com/aliyun/alibabacloud-jindodata) is a high-performance storage SDK developed by Alibaba Cloud for accessing OSS (Object Storage Service) and other cloud storage systems. It provides optimized I/O performance and deep integration with the Alibaba Cloud ecosystem. + +PyPaimon now supports using [PyJindoSDK](https://github.com/aliyun/alibabacloud-jindodata) (the Python binding of JindoSDK) to access OSS. Compared to the legacy implementation based on PyArrow's S3FileSystem, PyJindoSDK offers better performance and compatibility when working with OSS. + +## Usage + +### Installation + +Install `pyjindosdk` via pip: + +```shell +pip install pyjindosdk +``` + +Once installed, PyPaimon will automatically use PyJindoSDK as the default file I/O implementation for accessing OSS. No additional configuration is required. 
+ +### Fallback to Legacy Implementation + +Since JindoSDK is a native implementation, pre-built Python packages may not be available for all OS or platform versions. If you need to fall back to the legacy PyArrow-based implementation for any reason, there are two ways to do so: + +**Option 1: Set catalog option `fs.oss.impl` to `legacy`** + +```python +from pypaimon import CatalogFactory + +catalog_options = { + 'metastore': 'rest', + 'uri': 'http://rest-server:8080', + 'warehouse': 'oss://my-bucket/warehouse', + + # Fallback to the legacy PyArrow S3FileSystem implementation + 'fs.oss.impl': 'legacy', +} + +catalog = CatalogFactory.create(catalog_options) +``` + +**Option 2: Uninstall pyjindosdk** + +Simply uninstalling the `pyjindosdk` package will cause PyPaimon to automatically fall back to the legacy implementation: + +```shell +pip uninstall pyjindosdk +``` diff --git a/paimon-python/pypaimon/common/options/config.py b/paimon-python/pypaimon/common/options/config.py index 83d46c85bf..249ad810a2 100644 --- a/paimon-python/pypaimon/common/options/config.py +++ b/paimon-python/pypaimon/common/options/config.py @@ -18,8 +18,8 @@ from pypaimon.common.options.config_options import ConfigOptions class OssOptions: - OSS_IMPL = ConfigOptions.key("fs.oss.impl").string_type().default_value("default").with_description( - "OSS filesystem implementation: default or jindo") + OSS_IMPL = ConfigOptions.key("fs.oss.impl").string_type().default_value("jindo").with_description( + "OSS filesystem implementation: legacy or jindo") OSS_ACCESS_KEY_ID = ConfigOptions.key("fs.oss.accessKeyId").string_type().no_default_value().with_description( "OSS access key ID") OSS_ACCESS_KEY_SECRET = ConfigOptions.key( diff --git a/paimon-python/pypaimon/filesystem/pyarrow_file_io.py b/paimon-python/pypaimon/filesystem/pyarrow_file_io.py index 6cf2faabb2..87d11e55f2 100644 --- a/paimon-python/pypaimon/filesystem/pyarrow_file_io.py +++ b/paimon-python/pypaimon/filesystem/pyarrow_file_io.py @@ -34,7 
+34,7 @@ from pypaimon.common.file_io import FileIO from pypaimon.common.options import Options from pypaimon.common.options.config import OssOptions, S3Options from pypaimon.common.uri_reader import UriReaderFactory -from pypaimon.filesystem.jindo_file_system_handler import JindoFileSystemHandler +from pypaimon.filesystem.jindo_file_system_handler import JindoFileSystemHandler, JINDO_AVAILABLE from pypaimon.schema.data_types import (AtomicType, DataField, PyarrowFieldParser) from pypaimon.table.row.blob import Blob, BlobData, BlobDescriptor @@ -57,12 +57,24 @@ class PyArrowFileIO(FileIO): self.uri_reader_factory = UriReaderFactory(catalog_options) self._is_oss = scheme in {"oss"} self._oss_bucket = None - self._oss_impl = self.properties.get(OssOptions.OSS_IMPL) + _oss_impl = self.properties.get(OssOptions.OSS_IMPL) + self._use_jindo = False + if self._is_oss: self._oss_bucket = self._extract_oss_bucket(path) - if self._oss_impl == "jindo": + if _oss_impl not in ("jindo", "legacy"): + raise ValueError( + f"Unsupported fs.oss.impl value: '{_oss_impl}'. " + f"Supported values are 'jindo' and 'legacy'.") + if _oss_impl == "legacy": + self.filesystem = self._initialize_oss_fs(path) + elif JINDO_AVAILABLE: self.filesystem = self._initialize_jindo_fs(path) else: + self.logger.info( + "fs.oss.impl is 'jindo' but pyjindosdk is not installed. " + "Falling back to legacy PyArrow S3FileSystem implementation. 
" + "Install pyjindosdk for better performance: pip install pyjindosdk") self.filesystem = self._initialize_oss_fs(path) elif scheme in {"s3", "s3a", "s3n"}: self.filesystem = self._initialize_s3_fs() @@ -126,6 +138,7 @@ class PyArrowFileIO(FileIO): self.logger.info(f"Initializing JindoFileSystem for OSS access: {path}") root_path = f"oss://{self._oss_bucket}/" fs_handler = JindoFileSystemHandler(root_path, self.properties) + self._use_jindo = True return pafs.PyFileSystem(fs_handler) def _initialize_oss_fs(self, path) -> FileSystem: @@ -210,7 +223,7 @@ class PyArrowFileIO(FileIO): def new_output_stream(self, path: str): path_str = self.to_filesystem_path(path) - if self._oss_impl == "jindo": + if self._use_jindo: pass elif self._is_oss and not self._pyarrow_gte_7: # For PyArrow 6.x + OSS, path_str is already just the key part @@ -574,7 +587,7 @@ class PyArrowFileIO(FileIO): path_part = normalized_path.lstrip('/') return f"{drive_letter}:/{path_part}" if path_part else f"{drive_letter}:" - if self._oss_impl == "jindo": + if self._use_jindo: # For JindoFileSystem, pass key only path_part = normalized_path.lstrip('/') return path_part if path_part else '.' 
diff --git a/paimon-python/pypaimon/tests/file_io_test.py b/paimon-python/pypaimon/tests/file_io_test.py index 5cc4d7a821..d39d0c6461 100644 --- a/paimon-python/pypaimon/tests/file_io_test.py +++ b/paimon-python/pypaimon/tests/file_io_test.py @@ -70,6 +70,7 @@ class FileIOTest(unittest.TestCase): OssOptions.OSS_ENDPOINT.key(): 'oss-cn-hangzhou.aliyuncs.com', OssOptions.OSS_ACCESS_KEY_ID.key(): 'test-key', OssOptions.OSS_ACCESS_KEY_SECRET.key(): 'test-secret', + OssOptions.OSS_IMPL.key(): 'legacy', })) got = oss_io.to_filesystem_path("oss://test-bucket/path/to/file.txt") self.assertEqual(got, "path/to/file.txt" if lt7 else "test-bucket/path/to/file.txt") @@ -291,6 +292,7 @@ class FileIOTest(unittest.TestCase): OssOptions.OSS_ENDPOINT.key(): 'oss-cn-hangzhou.aliyuncs.com', OssOptions.OSS_ACCESS_KEY_ID.key(): 'test-key', OssOptions.OSS_ACCESS_KEY_SECRET.key(): 'test-secret', + OssOptions.OSS_IMPL.key(): 'legacy', })) mock_fs = MagicMock() mock_fs.get_file_info.return_value = [
