liurenjie1024 commented on code in PR #515:
URL: https://github.com/apache/iceberg-rust/pull/515#discussion_r1715332184


##########
crates/iceberg/src/arrow/reader.rs:
##########
@@ -44,25 +43,39 @@ use crate::error::Result;
 use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor};
 use crate::expr::{BoundPredicate, BoundReference};
 use crate::io::{FileIO, FileMetadata, FileRead};
-use crate::scan::{ArrowRecordBatchStream, FileScanTaskStream};
+use crate::runtime::spawn;
+use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
 use crate::spec::{Datum, Schema};
 use crate::{Error, ErrorKind};
 
 /// Builder to create ArrowReader
 pub struct ArrowReaderBuilder {
     batch_size: Option<usize>,
     file_io: FileIO,
+    concurrency_limit_data_files: usize,
 }
 
 impl ArrowReaderBuilder {
     /// Create a new ArrowReaderBuilder
     pub(crate) fn new(file_io: FileIO) -> Self {
+        let num_cpus = std::thread::available_parallelism()
+            .expect("failed to get number of CPUs")

Review Comment:
   I don't think it's a good idea to panic here; returning a constant default value and logging a warning would be more appropriate.
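   
   A minimal sketch of that suggestion, assuming the `log` facade crate is available for logging (the constant name and value of `DEFAULT_CONCURRENCY` are illustrative, not from the PR):
   
   ```rust
   // Fall back to a fixed default instead of panicking when the CPU
   // count cannot be determined, and log the failure.
   const DEFAULT_CONCURRENCY: usize = 4; // illustrative default, not from the PR
   
   let num_cpus = std::thread::available_parallelism()
       .map(|n| n.get())
       .unwrap_or_else(|err| {
           // Assumes the `log` facade; the crate may route logging elsewhere.
           log::warn!("failed to get number of CPUs: {err}, falling back to {DEFAULT_CONCURRENCY}");
           DEFAULT_CONCURRENCY
       });
   ```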



##########
crates/iceberg/src/arrow/reader.rs:
##########
@@ -84,73 +98,113 @@ impl ArrowReaderBuilder {
 pub struct ArrowReader {
     batch_size: Option<usize>,
     file_io: FileIO,
+
+    /// the maximum number of data files that can be fetched at the same time
+    concurrency_limit_data_files: usize,
 }
 
 impl ArrowReader {
     /// Take a stream of FileScanTasks and reads all the files.
     /// Returns a stream of Arrow RecordBatches containing the data from the files
-    pub fn read(self, mut tasks: FileScanTaskStream) -> crate::Result<ArrowRecordBatchStream> {
+    pub fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
         let file_io = self.file_io.clone();
-
-        Ok(try_stream! {
-            while let Some(task_result) = tasks.next().await {
-                match task_result {
-                    Ok(task) => {
-                        // Collect Parquet column indices from field ids
-                        let mut collector = CollectFieldIdVisitor {
-                            field_ids: HashSet::default(),
-                        };
-                        if let Some(predicates) = task.predicate() {
-                            visit(&mut collector, predicates)?;
+        let batch_size = self.batch_size;
+        let max_concurrent_fetching_datafiles = self.concurrency_limit_data_files;
+
+        let (tx, rx) = channel(10);

Review Comment:
   I think this channel's capacity should be the same value as `concurrency_limit_data_files`?
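   
   A sketch of the suggested change, assuming the `channel` in the diff is `futures::channel::mpsc::channel` (the variable comes from the surrounding diff):
   
   ```rust
   use futures::channel::mpsc::channel; // assumption about which channel is in use
   
   // Size the buffer to the data-file fetch concurrency rather than a
   // hard-coded 10, so the channel's backpressure matches the limit.
   let (tx, rx) = channel(max_concurrent_fetching_datafiles);
   ```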



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

