yahoNanJing commented on code in PR #256:
URL: https://github.com/apache/arrow-ballista/pull/256#discussion_r976610719
##########
ballista/rust/core/src/execution_plans/shuffle_reader.rs:
##########
@@ -102,29 +107,35 @@ impl ExecutionPlan for ShuffleReaderExec {
fn execute(
&self,
partition: usize,
- _context: Arc<TaskContext>,
+ context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream> {
- debug!("ShuffleReaderExec::execute({})", partition);
-
- let fetch_time =
- MetricBuilder::new(&self.metrics).subset_time("fetch_time",
partition);
-
- let locations = self.partition[partition].clone();
- let stream = locations.into_iter().map(move |p| {
- let fetch_time = fetch_time.clone();
- futures::stream::once(async move {
- let timer = fetch_time.timer();
- let r = fetch_partition(&p).await;
- timer.done();
-
- r.map_err(|e| ArrowError::ExternalError(Box::new(e)))
- })
- .try_flatten()
- });
+ let task_id = context.task_id().unwrap_or_else(||
partition.to_string());
+ info!("ShuffleReaderExec::execute({})", task_id);
+
+ // TODO make the maximum size configurable, or make it depends on
global memory control
+ let max_request_num = 50usize;
+ let mut partition_locations = HashMap::new();
+ for p in &self.partition[partition] {
+ partition_locations
+ .entry(p.executor_meta.id.clone())
+ .or_insert_with(Vec::new)
+ .push(p.clone());
+ }
+ // Sort partitions for evenly send fetching partition requests to
avoid hot executors within one task
+ let mut partition_locations: Vec<PartitionLocation> =
partition_locations
+ .into_values()
+ .flat_map(|ps| ps.into_iter().enumerate())
+ .sorted_by(|(p1_idx, _), (p2_idx, _)| Ord::cmp(p1_idx, p2_idx))
+ .map(|(_, p)| p)
+ .collect();
+ // Shuffle partitions for evenly send fetching partition requests to
avoid hot executors within multiple tasks
+ partition_locations.shuffle(&mut thread_rng());
Review Comment:
Sorting first may help reduce the bias of the random selection, though it
may not be necessary.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]