luoyuxia commented on code in PR #93:
URL: https://github.com/apache/fluss-rust/pull/93#discussion_r2616651086
##########
Cargo.toml:
##########
@@ -36,3 +36,8 @@ tokio = { version = "1.44.2", features = ["full"] }
clap = { version = "4.5.37", features = ["derive"] }
arrow = { version = "57.0.0", features = ["ipc_compression"] }
chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] }
+
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
Review Comment:
Then `serde` and `serde_json` in `crates/fluss/Cargo.toml` can be `workspace = true`.
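For clarity, a sketch of what that would look like in `crates/fluss/Cargo.toml`, assuming the workspace entries above land as-is:

```toml
[dependencies]
serde = { workspace = true }
serde_json = { workspace = true }
```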
##########
crates/fluss/src/client/table/remote_log.rs:
##########
@@ -254,10 +314,10 @@ impl RemotePendingFetch {
// delete the downloaded local file to free disk
delete_file(file_path).await;
- // Parse log records
+ // Parse log records (remote log contains full data, need client-side projection)
let mut fetch_records = vec![];
for log_record in &mut LogRecordsBatchs::new(data) {
- fetch_records.extend(log_record.records(&self.read_context)?);
+ fetch_records.extend(log_record.records_for_remote_log(&self.read_context)?);
Review Comment:
Thanks for catching that.
##########
crates/fluss/src/client/table/remote_log.rs:
##########
@@ -184,13 +223,34 @@ impl RemoteLogDownloader {
// opendal::Reader::read accepts a range, so we read in chunks
const CHUNK_SIZE: u64 = 8 * 1024 * 1024; // 8MB chunks for efficient reading
let mut offset = 0u64;
+ let mut chunk_count = 0u64;
+ let total_chunks = file_size.div_ceil(CHUNK_SIZE);
while offset < file_size {
let end = std::cmp::min(offset + CHUNK_SIZE, file_size);
let range = offset..end;
-
- // Read chunk from remote storage
- let chunk = op.read_with(relative_path).range(range.clone()).await?;
+ chunk_count += 1;
+
+ if chunk_count <= 3 || chunk_count % 10 == 0 {
+ eprintln!(
Review Comment:
I'm guessing this is to track download progress, but why use `eprintln!`, which is meant for error output?
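A minimal sketch of the alternative, assuming the crate uses the `log` facade (the helper name is hypothetical, and I haven't checked this PR's dependencies):

```rust
use log::debug;

/// Hypothetical helper mirroring the progress check above; progress is
/// informational, so it goes to the logger rather than stderr.
fn log_chunk_progress(chunk_count: u64, total_chunks: u64) {
    if chunk_count <= 3 || chunk_count % 10 == 0 {
        debug!("downloaded remote log chunk {chunk_count}/{total_chunks}");
    }
}
```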
##########
Cargo.toml:
##########
@@ -36,3 +36,8 @@ tokio = { version = "1.44.2", features = ["full"] }
clap = { version = "4.5.37", features = ["derive"] }
arrow = { version = "57.0.0", features = ["ipc_compression"] }
chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] }
+
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+opendal = { version = "0.53", features = ["services-s3"] }
Review Comment:
Can it be
opendal = version = "0.53
directly?
Since we already enable the feature in `crates/fluss/Cargo.toml`
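Something like this sketch, relying on the fact that features on workspace dependencies are additive, so member crates can still enable `services-s3` themselves:

```toml
# workspace root Cargo.toml
[workspace.dependencies]
opendal = "0.53"

# crates/fluss/Cargo.toml
[dependencies]
opendal = { workspace = true, features = ["services-s3"] }
```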
##########
crates/fluss/src/client/table/remote_log.rs:
##########
@@ -115,11 +116,19 @@ impl RemoteLogDownloadFuture {
/// Downloader for remote log segment files
pub struct RemoteLogDownloader {
local_log_dir: TempDir,
+ s3_props: RwLock<HashMap<String, String>>,
Review Comment:
Here we hard-code S3 props, but what if the remote storage is not S3?
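A minimal sketch of one storage-agnostic alternative (the `storage_props` name and the scheme-keyed map are hypothetical, not this PR's API):

```rust
use std::collections::HashMap;
use std::sync::RwLock;
use tempfile::TempDir;

/// Hypothetical variant: properties keyed by URL scheme ("s3", "oss", ...),
/// so each remote storage backend supplies its own config map.
pub struct RemoteLogDownloader {
    local_log_dir: TempDir,
    storage_props: RwLock<HashMap<String, HashMap<String, String>>>,
}
```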
##########
crates/fluss/src/client/table/remote_log.rs:
##########
@@ -169,12 +185,35 @@ impl RemoteLogDownloader {
// Create FileIO from the remote log tablet dir URL to get the storage
let file_io_builder = FileIO::from_url(&remote_log_tablet_dir_url)?;
+ // For S3/S3A URLs, inject S3 credentials from props
+ let file_io_builder = if remote_log_tablet_dir.starts_with("s3://")
+ || remote_log_tablet_dir.starts_with("s3a://")
+ {
+ file_io_builder.with_props(s3_props.iter().map(|(k, v)| (k.as_str(), v.as_str())))
+ } else {
+ file_io_builder
+ };
+
// Build storage and create operator directly
let storage = Storage::build(file_io_builder)?;
let (op, relative_path) = storage.create(remote_path)?;
- // Get file metadata to know the size
- let meta = op.stat(relative_path).await?;
+ // Timeout for remote storage operations (30 seconds)
+ const REMOTE_OP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
+
+ // Get file metadata to know the size with timeout
+ let stat_future = op.stat(relative_path);
+ let meta = tokio::time::timeout(REMOTE_OP_TIMEOUT, stat_future)
Review Comment:
Is the timeout still needed, considering we already have a `TimeoutLayer` in the S3 operator?
##########
crates/fluss/src/io/storage_s3.rs:
##########
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Result;
+use opendal::Configurator;
+use opendal::Operator;
+use opendal::layers::TimeoutLayer;
+use opendal::services::S3Config;
+use std::collections::HashMap;
+use std::time::Duration;
+
+pub(crate) fn s3_config_build(props: &HashMap<String, String>) -> Result<Operator> {
+ let config = S3Config::from_iter(props.clone())?;
+ let op = Operator::from_config(config)?.finish();
+
+ // Add timeout layer to prevent hanging on S3 operations
+ let timeout_layer = TimeoutLayer::new()
+ .with_timeout(Duration::from_secs(10))
+ .with_io_timeout(Duration::from_secs(30));
+
+ Ok(op.layer(timeout_layer))
+}
+
+pub(crate) fn parse_s3_path(path: &str) -> (&str, &str) {
Review Comment:
This seems to be unused; can it be removed?
##########
crates/examples/Cargo.toml:
##########
@@ -26,7 +26,10 @@ version = { workspace = true }
[dependencies]
fluss = { workspace = true }
tokio = { workspace = true }
-clap = { workspace = true}
+clap = { workspace = true }
+serde = { workspace = true }
Review Comment:
It seems `serde`, `serde_json`, and `opendal` are not used; can they be removed?
##########
crates/fluss/src/client/table/scanner.rs:
##########
@@ -256,6 +259,11 @@ impl LogFetcher {
if let Some(ref remote_log_fetch_info) =
fetch_log_for_bucket.remote_log_fetch_info
{
+ let s3_props = self
Review Comment:
Seems we always set S3 props here too, but what if the remote storage is not S3?
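A self-contained sketch of such a scheme check (all names here are illustrative, not the PR's actual API):

```rust
/// Hypothetical guard: backend-specific props should only be applied when the
/// remote log dir URL actually uses one of that backend's schemes.
fn props_apply_to(remote_dir: &str, scheme_prefixes: &[&str]) -> bool {
    scheme_prefixes.iter().any(|p| remote_dir.starts_with(p))
}

fn main() {
    assert!(props_apply_to("s3a://bucket/logs", &["s3://", "s3a://"]));
    assert!(!props_apply_to("oss://bucket/logs", &["s3://", "s3a://"]));
}
```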
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]