This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new da8f7427c Fix S3 query canonicalization (#2800) (#2801)
da8f7427c is described below
commit da8f7427c27f0d26092d0536d1343421250fc4cf
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Sep 29 15:46:03 2022 +0100
Fix S3 query canonicalization (#2800) (#2801)
* Fix S3 query canonicalization (#2800)
* Disable listing with spaces on azurite and localstack
---
object_store/src/aws/client.rs | 16 ++--------------
object_store/src/aws/credential.rs | 37 ++++++++++++++++++++++++++++++++++++-
object_store/src/aws/mod.rs | 20 ++++++++++++++++++--
object_store/src/azure/mod.rs | 15 ++++++---------
object_store/src/lib.rs | 22 ++++++++++++++++++++++
object_store/src/path/mod.rs | 11 +++++++++++
6 files changed, 95 insertions(+), 26 deletions(-)
diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs
index f800fec3d..5ec9390ec 100644
--- a/object_store/src/aws/client.rs
+++ b/object_store/src/aws/client.rs
@@ -16,6 +16,7 @@
// under the License.
use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider};
+use crate::aws::STRICT_PATH_ENCODE_SET;
use crate::client::pagination::stream_paginated;
use crate::client::retry::RetryExt;
use crate::multipart::UploadPart;
@@ -26,26 +27,13 @@ use crate::{
};
use bytes::{Buf, Bytes};
use chrono::{DateTime, Utc};
-use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode,
NON_ALPHANUMERIC};
+use percent_encoding::{utf8_percent_encode, PercentEncode};
use reqwest::{Client as ReqwestClient, Method, Response, StatusCode};
use serde::{Deserialize, Serialize};
use snafu::{ResultExt, Snafu};
use std::ops::Range;
use std::sync::Arc;
-//
http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
-//
-// Do not URI-encode any of the unreserved characters that RFC 3986 defines:
-// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~
).
-const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC
- .remove(b'-')
- .remove(b'.')
- .remove(b'_')
- .remove(b'~');
-
-/// This struct is used to maintain the URI path encoding
-const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/');
-
/// A specialized `Error` for object store-related errors
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
diff --git a/object_store/src/aws/credential.rs
b/object_store/src/aws/credential.rs
index 1abf42be9..d4461645f 100644
--- a/object_store/src/aws/credential.rs
+++ b/object_store/src/aws/credential.rs
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+use crate::aws::STRICT_ENCODE_SET;
use crate::client::retry::RetryExt;
use crate::client::token::{TemporaryToken, TokenCache};
use crate::util::hmac_sha256;
@@ -22,6 +23,7 @@ use crate::{Result, RetryConfig};
use bytes::Buf;
use chrono::{DateTime, Utc};
use futures::TryFutureExt;
+use percent_encoding::utf8_percent_encode;
use reqwest::header::{HeaderMap, HeaderValue};
use reqwest::{Client, Method, Request, RequestBuilder, StatusCode};
use serde::Deserialize;
@@ -29,6 +31,7 @@ use std::collections::BTreeMap;
use std::sync::Arc;
use std::time::Instant;
use tracing::warn;
+use url::Url;
type StdError = Box<dyn std::error::Error + Send + Sync>;
@@ -103,13 +106,14 @@ impl<'a> RequestSigner<'a> {
request.headers_mut().insert(HASH_HEADER, header_digest);
let (signed_headers, canonical_headers) =
canonicalize_headers(request.headers());
+ let canonical_query = canonicalize_query(request.url());
//
https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
let canonical_request = format!(
"{}\n{}\n{}\n{}\n{}\n{}",
request.method().as_str(),
request.url().path(), // S3 doesn't percent encode this like other
services
- request.url().query().unwrap_or(""), // This assumes the query
pairs are in order
+ canonical_query,
canonical_headers,
signed_headers,
digest
@@ -207,6 +211,37 @@ fn hex_encode(bytes: &[u8]) -> String {
out
}
+/// Canonicalizes query parameters into the AWS canonical form
+///
+///
<https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html>
+fn canonicalize_query(url: &Url) -> String {
+ use std::fmt::Write;
+
+ let capacity = match url.query() {
+ Some(q) if !q.is_empty() => q.len(),
+ _ => return String::new(),
+ };
+ let mut encoded = String::with_capacity(capacity + 1);
+
+ let mut headers = url.query_pairs().collect::<Vec<_>>();
+ headers.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
+
+ let mut first = true;
+ for (k, v) in headers {
+ if !first {
+ encoded.push('&');
+ }
+ first = false;
+ let _ = write!(
+ encoded,
+ "{}={}",
+ utf8_percent_encode(k.as_ref(), &STRICT_ENCODE_SET),
+ utf8_percent_encode(v.as_ref(), &STRICT_ENCODE_SET)
+ );
+ }
+ encoded
+}
+
/// Canonicalizes headers into the AWS Canonical Form.
///
///
<https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html>
diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs
index d1d0a12cd..d186c7f47 100644
--- a/object_store/src/aws/mod.rs
+++ b/object_store/src/aws/mod.rs
@@ -58,6 +58,20 @@ use crate::{
mod client;
mod credential;
+//
http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
+//
+// Do not URI-encode any of the unreserved characters that RFC 3986 defines:
+// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~
).
+pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet =
+ percent_encoding::NON_ALPHANUMERIC
+ .remove(b'-')
+ .remove(b'.')
+ .remove(b'_')
+ .remove(b'~');
+
+/// This struct is used to maintain the URI path encoding
+const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet =
STRICT_ENCODE_SET.remove(b'/');
+
/// A specialized `Error` for object store-related errors
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
@@ -551,7 +565,7 @@ mod tests {
use super::*;
use crate::tests::{
get_nonexistent_object, list_uses_directories_correctly,
list_with_delimiter,
- put_get_delete_list, rename_and_copy, stream_get,
+ put_get_delete_list_opts, rename_and_copy, stream_get,
};
use bytes::Bytes;
use std::env;
@@ -677,9 +691,11 @@ mod tests {
#[tokio::test]
async fn s3_test() {
let config = maybe_skip_integration!();
+ let is_local = matches!(&config.endpoint, Some(e) if
e.starts_with("http://"));
let integration = config.build().unwrap();
- put_get_delete_list(&integration).await;
+ // Localstack doesn't support listing with spaces
https://github.com/localstack/localstack/issues/6328
+ put_get_delete_list_opts(&integration, is_local).await;
list_uses_directories_correctly(&integration).await;
list_with_delimiter(&integration).await;
rename_and_copy(&integration).await;
diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs
index dd1cde9c7..f7ca4cf4e 100644
--- a/object_store/src/azure/mod.rs
+++ b/object_store/src/azure/mod.rs
@@ -595,7 +595,7 @@ mod tests {
use super::*;
use crate::tests::{
copy_if_not_exists, list_uses_directories_correctly,
list_with_delimiter,
- put_get_delete_list, rename_and_copy, stream_get,
+ put_get_delete_list, put_get_delete_list_opts, rename_and_copy,
stream_get,
};
use std::env;
@@ -663,9 +663,10 @@ mod tests {
#[tokio::test]
async fn azure_blob_test() {
+ let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok();
let integration = maybe_skip_integration!().build().unwrap();
-
- put_get_delete_list(&integration).await;
+ // Azurite doesn't support listing with spaces -
https://github.com/localstack/localstack/issues/6328
+ put_get_delete_list_opts(&integration, use_emulator).await;
list_uses_directories_correctly(&integration).await;
list_with_delimiter(&integration).await;
rename_and_copy(&integration).await;
@@ -687,13 +688,9 @@ mod tests {
.with_container_name(
env::var("OBJECT_STORE_BUCKET").expect("must be set
OBJECT_STORE_BUCKET"),
)
- .with_client_secret_authorization(
- env::var("AZURE_STORAGE_CLIENT_ID")
+ .with_access_key(
+ env::var("AZURE_STORAGE_ACCESS_KEY")
.expect("must be set AZURE_STORAGE_CLIENT_ID"),
- env::var("AZURE_STORAGE_CLIENT_SECRET")
- .expect("must be set AZURE_STORAGE_CLIENT_SECRET"),
- env::var("AZURE_STORAGE_TENANT_ID")
- .expect("must be set AZURE_STORAGE_TENANT_ID"),
);
let integration = builder.build().unwrap();
diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 16f0c6f3a..5eaaabaf2 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -506,6 +506,13 @@ mod tests {
use tokio::io::AsyncWriteExt;
pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) {
+ put_get_delete_list_opts(storage, false).await
+ }
+
+ pub(crate) async fn put_get_delete_list_opts(
+ storage: &DynObjectStore,
+ skip_list_with_spaces: bool,
+ ) {
delete_fixtures(storage).await;
let content_list = flatten_list_stream(storage, None).await.unwrap();
@@ -701,6 +708,21 @@ mod tests {
assert_eq!(files, vec![path.clone()]);
storage.delete(&path).await.unwrap();
+
+ let path = Path::parse("foo bar/I contain spaces.parquet").unwrap();
+ storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
+ storage.head(&path).await.unwrap();
+
+ if !skip_list_with_spaces {
+ let files = flatten_list_stream(storage, Some(&Path::from("foo
bar")))
+ .await
+ .unwrap();
+ assert_eq!(files, vec![path.clone()]);
+ }
+ storage.delete(&path).await.unwrap();
+
+ let files = flatten_list_stream(storage, None).await.unwrap();
+ assert!(files.is_empty(), "{:?}", files);
}
fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec<Bytes> {
diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs
index e5a7b6443..80e0f792a 100644
--- a/object_store/src/path/mod.rs
+++ b/object_store/src/path/mod.rs
@@ -534,4 +534,15 @@ mod tests {
needle
);
}
+
+ #[test]
+ fn path_containing_spaces() {
+ let a = Path::from_iter(["foo bar", "baz"]);
+ let b = Path::from("foo bar/baz");
+ let c = Path::parse("foo bar/baz").unwrap();
+
+ assert_eq!(a.raw, "foo bar/baz");
+ assert_eq!(a.raw, b.raw);
+ assert_eq!(b.raw, c.raw);
+ }
}