andygrove commented on code in PR #3247:
URL: https://github.com/apache/datafusion-comet/pull/3247#discussion_r2719001036
##########
spark/src/main/scala/org/apache/comet/serde/operator/CometIcebergNativeScan.scala:
##########
@@ -71,14 +71,138 @@ object CometIcebergNativeScan extends
CometOperatorSerde[CometBatchScanExec] wit
}
/**
- * Converts an Iceberg partition value to JSON format expected by
iceberg-rust.
- *
- * iceberg-rust's Literal::try_from_json() expects specific formats for
certain types:
- * - Timestamps: ISO string format "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"
- * - Dates: ISO string format "YYYY-MM-DD"
- * - Decimals: String representation
- *
- * See: iceberg-rust/crates/iceberg/src/spec/values/literal.rs
+ * Converts an Iceberg partition value to protobuf format. Protobuf is less
verbose than JSON.
+ * The following types are also serialized as integer values instead of as
strings - Timestamps,
+ * Dates, Decimals, FieldIDs
+ */
+ private def partitionValueToProto(
+ fieldId: Int,
+ fieldTypeStr: String,
+ value: Any): OperatorOuterClass.PartitionValue = {
+ val builder = OperatorOuterClass.PartitionValue.newBuilder()
+ builder.setFieldId(fieldId)
+
+ if (value == null) {
+ builder.setIsNull(true)
+ } else {
+ builder.setIsNull(false)
+ fieldTypeStr match {
+ case t if t.startsWith("timestamp") =>
+ val micros = value match {
+ case l: java.lang.Long => l.longValue()
+ case i: java.lang.Integer => i.longValue()
+ case _ => value.toString.toLong
+ }
+ if (t.contains("tz")) {
+ builder.setTimestampTzVal(micros)
+ } else {
+ builder.setTimestampVal(micros)
+ }
+
+ case "date" =>
+ val days = value.asInstanceOf[java.lang.Integer].intValue()
+ builder.setDateVal(days)
+
+ case d if d.startsWith("decimal(") =>
+ // Serialize as unscaled BigInteger bytes
+ val bigDecimal = value match {
+ case bd: java.math.BigDecimal => bd
+ case _ => new java.math.BigDecimal(value.toString)
+ }
+ val unscaledBytes = bigDecimal.unscaledValue().toByteArray
+
builder.setDecimalVal(com.google.protobuf.ByteString.copyFrom(unscaledBytes))
+
+ case "string" =>
+ builder.setStringVal(value.toString)
+
+ case "int" =>
+ val intVal = value match {
+ case i: java.lang.Integer => i.intValue()
+ case l: java.lang.Long => l.intValue()
+ case _ => value.toString.toInt
+ }
+ builder.setIntVal(intVal)
+
+ case "long" =>
+ val longVal = value match {
+ case l: java.lang.Long => l.longValue()
+ case i: java.lang.Integer => i.longValue()
+ case _ => value.toString.toLong
+ }
+ builder.setLongVal(longVal)
+
+ case "float" =>
+ val floatVal = value match {
+ case f: java.lang.Float => f.floatValue()
+ case d: java.lang.Double => d.floatValue()
+ case _ => value.toString.toFloat
+ }
+ builder.setFloatVal(floatVal)
+
+ case "double" =>
+ val doubleVal = value match {
+ case d: java.lang.Double => d.doubleValue()
+ case f: java.lang.Float => f.doubleValue()
+ case _ => value.toString.toDouble
+ }
+ builder.setDoubleVal(doubleVal)
+
+ case "boolean" =>
+ val boolVal = value match {
+ case b: java.lang.Boolean => b.booleanValue()
+ case _ => value.toString.toBoolean
+ }
+ builder.setBoolVal(boolVal)
+
+ case "uuid" =>
+ // UUID as bytes (16 bytes) or string
+ val uuidBytes = value match {
+ case uuid: java.util.UUID =>
+ val bb = java.nio.ByteBuffer.wrap(new Array[Byte](16))
+ bb.putLong(uuid.getMostSignificantBits)
+ bb.putLong(uuid.getLeastSignificantBits)
+ bb.array()
+ case _ =>
+ // Parse UUID string and convert to bytes
+ val uuid = java.util.UUID.fromString(value.toString)
+ val bb = java.nio.ByteBuffer.wrap(new Array[Byte](16))
+ bb.putLong(uuid.getMostSignificantBits)
+ bb.putLong(uuid.getLeastSignificantBits)
+ bb.array()
+ }
+
builder.setUuidVal(com.google.protobuf.ByteString.copyFrom(uuidBytes))
+
+ case t if t.startsWith("fixed[") || t.startsWith("binary") =>
+ val bytes = value match {
+ case bytes: Array[Byte] => bytes
+ case _ => value.toString.getBytes("UTF-8")
+ }
+ if (t.startsWith("fixed")) {
+ builder.setFixedVal(com.google.protobuf.ByteString.copyFrom(bytes))
+ } else {
+
builder.setBinaryVal(com.google.protobuf.ByteString.copyFrom(bytes))
+ }
+
+ // Fallback: infer type from Java type ?
+ case _ =>
+ value match {
+ case s: String => builder.setStringVal(s)
+ case i: java.lang.Integer => builder.setIntVal(i.intValue())
+ case l: java.lang.Long => builder.setLongVal(l.longValue())
+ case d: java.lang.Double => builder.setDoubleVal(d.doubleValue())
+ case f: java.lang.Float => builder.setFloatVal(f.floatValue())
+ case b: java.lang.Boolean => builder.setBoolVal(b.booleanValue())
+ case other => builder.setStringVal(other.toString)
+ }
+ }
+ }
+
+ builder.build()
+ }
+
+ /**
+ * Legacy JSON serialization function - removed in favor of protobuf. Kept
as reference for
+ * conversion logic.
*/
private def partitionValueToJson(fieldTypeStr: String, value: Any): JValue =
{
Review Comment:
can we just remove this now?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]