github-actions[bot] commented on code in PR #65135:
URL: https://github.com/apache/doris/pull/65135#discussion_r3510140252
##########
fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java:
##########
@@ -896,14 +954,66 @@ private Split createIcebergSplit(FileScanTask
fileScanTask) {
return split;
}
- private Split createIcebergSysSplit(FileScanTask fileScanTask) {
- long rowCount = fileScanTask.file() == null ? 1 :
fileScanTask.file().recordCount();
+ private Split createIcebergSysSplit(ScanTask scanTask) {
+ long rowCount = Math.max(scanTask.estimatedRowsCount(), 1L);
+ if (scanTask.isFileScanTask() && scanTask.asFileScanTask().file() !=
null) {
+ rowCount =
Math.max(scanTask.asFileScanTask().file().recordCount(), 1L);
+ }
IcebergSplit split = IcebergSplit.newSysTableSplit(
- SerializationUtil.serializeToBase64(fileScanTask), rowCount);
+ SerializationUtil.serializeToBase64(scanTask), rowCount);
split.setTableFormatType(TableFormatType.ICEBERG);
return split;
}
+ private Split createIcebergPositionDeleteSysSplit(PositionDeletesScanTask
task) {
+ DeleteFile deleteFile = task.file();
+ String originalPath = deleteFile.path().toString();
+ LocationPath locationPath = createLocationPathWithCache(originalPath);
+ IcebergSplit split = IcebergSplit.newPositionDeleteSysTableSplit(
+ locationPath, task.start(), task.length(),
deleteFile.fileSizeInBytes(),
+ storagePropertiesMap, originalPath);
+ split.setTableFormatType(TableFormatType.ICEBERG_POSITION_DELETES);
+
split.setPositionDeleteFileFormat(getNativePositionDeleteFileFormat(deleteFile.format()));
+ split.setPositionDeleteOriginalPath(originalPath);
+ if (deleteFile.format() == FileFormat.PUFFIN) {
+
split.setPositionDeleteContent(IcebergDeleteFileFilter.DeletionVector.type());
+
split.setPositionDeleteReferencedDataFilePath(deleteFile.referencedDataFile());
+ split.setPositionDeleteContentOffset(deleteFile.contentOffset());
+
split.setPositionDeleteContentSizeInBytes(deleteFile.contentSizeInBytes());
+ } else {
+
split.setPositionDeleteContent(IcebergDeleteFileFilter.PositionDelete.type());
+ }
+
+ split.setPartitionSpecId(deleteFile.specId());
+ PartitionSpec partitionSpec =
icebergTable.specs().get(deleteFile.specId());
+ if (partitionSpec != null && partitionSpec.isPartitioned() &&
deleteFile.partition() != null) {
+ split.setPartitionDataJson(getPartitionDataObjectJson(
+ (PartitionData) deleteFile.partition(), partitionSpec));
+ }
+ return split;
+ }
+
+ private TFileFormatType getNativePositionDeleteFileFormat(FileFormat
fileFormat) {
+ if (fileFormat == FileFormat.PARQUET || fileFormat ==
FileFormat.PUFFIN) {
+ return TFileFormatType.FORMAT_PARQUET;
+ } else if (fileFormat == FileFormat.ORC) {
+ return TFileFormatType.FORMAT_ORC;
+ }
+ throw new UnsupportedOperationException(
+ "Unsupported Iceberg position delete file format: " +
fileFormat);
+ }
+
+ private String getPartitionDataObjectJson(PartitionData partitionData,
PartitionSpec partitionSpec) {
+ List<String> partitionValues = IcebergUtils.getPartitionValues(
+ partitionData, partitionSpec, sessionVariable.getTimeZone());
+ Map<String, String> partitionJson = new LinkedHashMap<>();
+ List<PartitionField> fields = partitionSpec.fields();
+ for (int i = 0; i < fields.size(); i++) {
+ partitionJson.put(fields.get(i).name(), partitionValues.get(i));
Review Comment:
The partition payload here loses the actual JSON types.
`getPartitionValues()` returns strings for every Iceberg partition value, and
`Gson` serializes this `Map<String, String>` as e.g. `{"id":"1"}` for an
integer partition. On BE, `_append_partition_column()` deserializes that object
straight into the `partition` struct column; the struct serde forwards `"1"` to
the numeric field serde, which does not strip JSON quotes, and nullable struct
fields then turn the parse failure into NULL. So a table partitioned by an
integer/date field will expose wrong `partition` values in `$position_deletes`
even though the new tests only cover string `dt` partitions. Please preserve
typed JSON values here, and add a regression with at least one non-string
partition column.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]