danielcweeks commented on code in PR #7688:
URL: https://github.com/apache/iceberg/pull/7688#discussion_r1206095696
##########
core/src/main/java/org/apache/iceberg/BaseScan.java:
##########
@@ -256,4 +265,95 @@ private static Schema
lazyColumnProjection(TableScanContext context, Schema sche
/**
 * Returns the scan produced by {@code newRefinedScan} for the given metrics reporter.
 *
 * <p>Delegates to {@code newRefinedScan} with this scan's {@code table()}, {@code schema()} and
 * the result of {@code context().reportWith(reporter)}.
 *
 * @param reporter the reporter handed to {@code context().reportWith}
 * @return the value returned by {@code newRefinedScan}
 */
public ThisT metricsReporter(MetricsReporter reporter) {
return newRefinedScan(table(), schema(), context().reportWith(reporter));
}
+
+ private Optional<Long> adaptiveSplitSize(long tableSplitSize) {
+ if (!PropertyUtil.propertyAsBoolean(
+ table.properties(),
+ TableProperties.ADAPTIVE_SPLIT_PLANNING,
+ TableProperties.ADAPTIVE_SPLIT_PLANNING_DEFAULT)) {
+ return Optional.empty();
+ }
+
+ int minParallelism =
+ PropertyUtil.propertyAsInt(
+ table.properties(),
+ TableProperties.SPLIT_MIN_PARALLELISM,
+ TableProperties.SPLIT_MIN_PARALLELISM_DEFAULT);
+
+ Preconditions.checkArgument(minParallelism > 0, "Minimum parallelism must
be a positive value");
+
+ Snapshot snapshot =
+ Stream.of(context.snapshotId(), context.toSnapshotId())
+ .filter(Objects::nonNull)
+ .map(table::snapshot)
+ .findFirst()
+ .orElseGet(table::currentSnapshot);
+
+ if (snapshot == null || snapshot.summary() == null) {
+ return Optional.empty();
+ }
+
+ Map<String, String> summary = snapshot.summary();
+ long totalFiles =
+ PropertyUtil.propertyAsLong(summary,
SnapshotSummary.TOTAL_DATA_FILES_PROP, 0);
+ long totalSize = PropertyUtil.propertyAsLong(summary,
SnapshotSummary.TOTAL_FILE_SIZE_PROP, 0);
+
+ if (totalFiles <= 0 || totalSize <= 0) {
+ return Optional.empty();
+ }
+
+ if (totalFiles > minParallelism && totalSize >= tableSplitSize *
minParallelism) {
+ // If it is possible that splits could normally be calculated to meet the
+ // minimum parallelism, do not adjust the split size
+ return Optional.empty();
+ }
+
+ FileFormat fileFormat =
+ FileFormat.fromString(
+ table
+ .properties()
+ .getOrDefault(
+ TableProperties.DEFAULT_FILE_FORMAT,
+ TableProperties.DEFAULT_FILE_FORMAT_DEFAULT));
+
+ if (!fileFormat.isSplittable()) {
+ return Optional.of(totalSize / totalFiles);
+ }
+
+ long rowGroupSize;
+
+ switch (fileFormat) {
+ case PARQUET:
+ rowGroupSize =
+ PropertyUtil.propertyAsLong(
+ table.properties(),
+ TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
+ TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT);
+ break;
+ case ORC:
+ rowGroupSize =
+ PropertyUtil.propertyAsLong(
+ table.properties(),
+ TableProperties.ORC_BLOCK_SIZE_BYTES,
+ TableProperties.ORC_BLOCK_SIZE_BYTES_DEFAULT);
+ break;
+ case AVRO:
+ rowGroupSize = 16 * 1024 * 1024;
+ break;
+ default:
+ rowGroupSize = tableSplitSize;
+ }
+
+ if (totalFiles <= 1) {
+ // For a table with a single small file, default to the smallest of
+ // the configured table split size or the format block size
+ return Optional.of(Math.min(rowGroupSize, tableSplitSize));
+ }
+
+ long minSplitSize = totalSize / minParallelism;
+ // target split size chosen to provide the most parallelism
+ long targetSplitSize = Math.min(minSplitSize, rowGroupSize);
+
+ return Optional.of(targetSplitSize);
Review Comment:
I don't think that's generally going to be the case. Combining still
happens, and it's not clear to me that there is a situation where too many
splits would be produced.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]