yihua commented on code in PR #18744:
URL: https://github.com/apache/hudi/pull/18744#discussion_r3270752701
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/BatchedBlobReader.scala:
##########
@@ -208,16 +208,31 @@ class BatchedBlobReader(
// Dispatch based on storage_type (field 0)
val storageType = accessor.getString(blobStruct, 0)
if (storageType == HoodieSchema.Blob.INLINE) {
- // Case 1: Inline — bytes are in field 1
- val bytes = accessor.getBytes(blobStruct, 1)
- batch += RowInfo[R](
- originalRow = row,
- filePath = "",
- offset = -1,
- length = -1,
- index = rowIndex,
- inlineBytes = Some(bytes)
- )
+ // INLINE + CONTENT: inline_data is populated; return bytes directly (1-hop).
+ // INLINE + DESCRIPTOR: inline_data is null and the scan synthesized a
+ // reference pointing into the backing file's storage layout. We refuse to
+ // materialize bytes here — DESCRIPTOR is a metadata-only mode for INLINE
+ // rows, and the synthesized reference is an internal pointer, not
+ // user-facing storage info. Callers must switch to CONTENT mode or stop
+ // using read_blob() on INLINE columns under DESCRIPTOR.
+ if (!accessor.isNullAt(blobStruct, 1)) {
+ val bytes = accessor.getBytes(blobStruct, 1)
+ batch += RowInfo[R](
+ originalRow = row,
+ filePath = "",
+ offset = -1,
+ length = -1,
+ index = rowIndex,
+ inlineBytes = Some(bytes)
+ )
+ } else {
+ throw new IllegalStateException(
+ s"read_blob() cannot materialize bytes for an INLINE blob
under " +
+ s"DESCRIPTOR mode (row $rowIndex). Under " +
Review Comment:
This means that `read_blob()` should always read the blob content, regardless of
the config.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]