This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new ad89ff8242 Support tencent cloud COS storage in `datafusion-cli` (#9734)
ad89ff8242 is described below
commit ad89ff82421e9c4670f4440dd5a6fa6fb55c40c3
Author: Harvey Yue <[email protected]>
AuthorDate: Mon Mar 25 20:57:55 2024 +0800
Support tencent cloud COS storage in `datafusion-cli` (#9734)
* Support tencent cloud COS storage
* Fix clippy
* Update docs/source/user-guide/cli.md
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion-cli/src/catalog.rs | 2 +-
datafusion-cli/src/exec.rs | 16 ++++++++++++
datafusion-cli/src/object_storage.rs | 50 ++++++++++++++++++++++++------------
docs/source/user-guide/cli.md | 37 +++++++++++++++++++++-----
4 files changed, 81 insertions(+), 24 deletions(-)
diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs
index 46dd8bb00f..0fbb7a5908 100644
--- a/datafusion-cli/src/catalog.rs
+++ b/datafusion-cli/src/catalog.rs
@@ -177,7 +177,7 @@ impl SchemaProvider for DynamicFileSchemaProvider {
// Register the store for this URL. Here we don't have access
// to any command options so the only choice is to use an empty collection
match scheme {
- "s3" | "oss" => {
+ "s3" | "oss" | "cos" => {
state =
state.add_table_options_extension(AwsOptions::default());
}
"gs" | "gcs" => {
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index 4e374a4c00..114e3cefa3 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -415,6 +415,7 @@ mod tests {
let locations = vec![
"s3://bucket/path/file.parquet",
"oss://bucket/path/file.parquet",
+ "cos://bucket/path/file.parquet",
"gcs://bucket/path/file.parquet",
];
let mut ctx = SessionContext::new();
@@ -497,6 +498,21 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn create_object_store_table_cos() -> Result<()> {
+ let access_key_id = "fake_access_key_id";
+ let secret_access_key = "fake_secret_access_key";
+ let endpoint = "fake_endpoint";
+ let location = "cos://bucket/path/file.parquet";
+
+ // Should be OK
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('aws.access_key_id' '{access_key_id}',
'aws.secret_access_key' '{secret_access_key}', 'aws.cos.endpoint' '{endpoint}')
LOCATION '{location}'");
+ create_external_table_test(location, &sql).await?;
+
+ Ok(())
+ }
+
#[tokio::test]
async fn create_object_store_table_gcs() -> Result<()> {
let service_account_path = "fake_service_account_path";
diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs
index 033c8f839a..94560cb9d8 100644
--- a/datafusion-cli/src/object_storage.rs
+++ b/datafusion-cli/src/object_storage.rs
@@ -19,7 +19,7 @@ use std::any::Any;
use std::fmt::{Debug, Display};
use std::sync::Arc;
-use datafusion::common::{config_namespace, exec_datafusion_err, exec_err,
internal_err};
+use datafusion::common::{exec_datafusion_err, exec_err, internal_err};
use datafusion::error::{DataFusionError, Result};
use datafusion::execution::context::SessionState;
use datafusion::prelude::SessionContext;
@@ -106,12 +106,27 @@ impl CredentialProvider for S3CredentialProvider {
pub fn get_oss_object_store_builder(
url: &Url,
aws_options: &AwsOptions,
+) -> Result<AmazonS3Builder> {
+ get_object_store_builder(url, aws_options, true)
+}
+
+pub fn get_cos_object_store_builder(
+ url: &Url,
+ aws_options: &AwsOptions,
+) -> Result<AmazonS3Builder> {
+ get_object_store_builder(url, aws_options, false)
+}
+
+fn get_object_store_builder(
+ url: &Url,
+ aws_options: &AwsOptions,
+ virtual_hosted_style_request: bool,
) -> Result<AmazonS3Builder> {
let bucket_name = get_bucket_name(url)?;
let mut builder = AmazonS3Builder::from_env()
- .with_virtual_hosted_style_request(true)
+ .with_virtual_hosted_style_request(virtual_hosted_style_request)
.with_bucket_name(bucket_name)
- // oss don't care about the "region" field
+ // oss/cos don't care about the "region" field
.with_region("do_not_care");
if let (Some(access_key_id), Some(secret_access_key)) =
@@ -122,7 +137,7 @@ pub fn get_oss_object_store_builder(
.with_secret_access_key(secret_access_key);
}
- if let Some(endpoint) = &aws_options.oss.endpoint {
+ if let Some(endpoint) = &aws_options.endpoint {
builder = builder.with_endpoint(endpoint);
}
@@ -171,14 +186,8 @@ pub struct AwsOptions {
pub session_token: Option<String>,
/// AWS Region
pub region: Option<String>,
- /// Object Storage Service options
- pub oss: OssOptions,
-}
-
-config_namespace! {
- pub struct OssOptions {
- pub endpoint: Option<String>, default = None
- }
+ /// OSS or COS Endpoint
+ pub endpoint: Option<String>,
}
impl ExtensionOptions for AwsOptions {
@@ -210,8 +219,8 @@ impl ExtensionOptions for AwsOptions {
"region" => {
self.region.set(rem, value)?;
}
- "oss" => {
- self.oss.set(rem, value)?;
+ "oss" | "cos" => {
+ self.endpoint.set(rem, value)?;
}
_ => {
return internal_err!("Config value \"{}\" not found on
AwsOptions", rem);
@@ -252,7 +261,7 @@ impl ExtensionOptions for AwsOptions {
.visit(&mut v, "secret_access_key", "");
self.session_token.visit(&mut v, "session_token", "");
self.region.visit(&mut v, "region", "");
- self.oss.visit(&mut v, "oss", "");
+ self.endpoint.visit(&mut v, "endpoint", "");
v.0
}
}
@@ -376,7 +385,7 @@ pub(crate) fn register_options(ctx: &SessionContext, scheme: &str) {
// Match the provided scheme against supported cloud storage schemes:
match scheme {
// For Amazon S3 or Alibaba Cloud OSS
- "s3" | "oss" => {
+ "s3" | "oss" | "cos" => {
// Register AWS specific table options in the session context:
ctx.register_table_options_extension(AwsOptions::default())
}
@@ -415,6 +424,15 @@ pub(crate) async fn get_object_store(
let builder = get_oss_object_store_builder(url, options)?;
Arc::new(builder.build()?)
}
+ "cos" => {
+ let Some(options) = table_options.extensions.get::<AwsOptions>()
else {
+ return exec_err!(
+ "Given table options incompatible with the 'cos' scheme"
+ );
+ };
+ let builder = get_cos_object_store_builder(url, options)?;
+ Arc::new(builder.build()?)
+ }
"gs" | "gcs" => {
let Some(options) = table_options.extensions.get::<GcpOptions>()
else {
return exec_err!(
diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index a94e2427ea..da4c987054 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -312,9 +312,9 @@ select count(*) from hits;
CREATE EXTERNAL TABLE test
STORED AS PARQUET
OPTIONS(
- 'access_key_id' '******',
- 'secret_access_key' '******',
- 'region' 'us-east-2'
+ 'aws.access_key_id' '******',
+ 'aws.secret_access_key' '******',
+ 'aws.region' 'us-east-2'
)
LOCATION 's3://bucket/path/file.parquet';
```
@@ -365,9 +365,9 @@ Details of the environment variables that can be used are:
CREATE EXTERNAL TABLE test
STORED AS PARQUET
OPTIONS(
- 'access_key_id' '******',
- 'secret_access_key' '******',
- 'endpoint' 'https://bucket.oss-cn-hangzhou.aliyuncs.com'
+ 'aws.access_key_id' '******',
+ 'aws.secret_access_key' '******',
+ 'aws.oss.endpoint' 'https://bucket.oss-cn-hangzhou.aliyuncs.com'
)
LOCATION 'oss://bucket/path/file.parquet';
```
@@ -380,6 +380,29 @@ The supported OPTIONS are:
Note that the `endpoint` format of oss needs to be: `https://{bucket}.{oss-region-endpoint}`
+## Registering COS Data Sources
+
+[Tencent cloud COS](https://cloud.tencent.com/product/cos) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
+
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+ 'aws.access_key_id' '******',
+ 'aws.secret_access_key' '******',
+ 'aws.cos.endpoint' 'https://cos.ap-singapore.myqcloud.com'
+)
+LOCATION 'cos://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- access_key_id
+- secret_access_key
+- endpoint
+
+Note that the `endpoint` format of urls must be: `https://cos.{cos-region-endpoint}`
+
## Registering GCS Data Sources
[Google Cloud Storage](https://cloud.google.com/storage) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
@@ -388,7 +411,7 @@ Note that the `endpoint` format of oss needs to be: `https://{bucket}.{oss-regio
CREATE EXTERNAL TABLE test
STORED AS PARQUET
OPTIONS(
- 'service_account_path' '/tmp/gcs.json',
+ 'gcp.service_account_path' '/tmp/gcs.json',
)
LOCATION 'gs://bucket/path/file.parquet';
```