Jefffrey commented on code in PR #20253:
URL: https://github.com/apache/datafusion/pull/20253#discussion_r2787518801


##########
datafusion/common/src/config.rs:
##########


Review Comment:
   Would be good if we can drive-by fix this docstring too (it reads as if its 
a boolean)



##########
datafusion/proto/src/logical_plan/file_formats.rs:
##########
@@ -419,7 +422,7 @@ mod parquet {
                 binary_as_string: global_options.global.binary_as_string,
                 skip_arrow_metadata: global_options.global.skip_arrow_metadata,
                 coerce_int96_opt: 
global_options.global.coerce_int96.map(|compression| {
-                    parquet_options::CoerceInt96Opt::CoerceInt96(compression)
+                    
parquet_options::CoerceInt96Opt::CoerceInt96(compression.to_string())

Review Comment:
   can we fix this name too, since `compression` seems wrong



##########
datafusion/common/src/parquet_config.rs:
##########
@@ -106,3 +106,93 @@ impl From<parquet::file::properties::WriterVersion> for 
DFParquetWriterVersion {
         }
     }
 }
+
+/// Time unit options for Parquet INT96 timestamp coercion.
+///
+/// This enum validates time unit values at configuration time,
+/// ensuring only supported units ("ns", "us", "ms", "s") can be set
+/// via `SET` commands or proto deserialization.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum DFTimeUnit {
+    /// Nanoseconds

Review Comment:
   We can just name the enum values this if we need comments to explain the 
variants



##########
datafusion/common/src/parquet_config.rs:
##########
@@ -106,3 +106,93 @@ impl From<parquet::file::properties::WriterVersion> for 
DFParquetWriterVersion {
         }
     }
 }
+
+/// Time unit options for Parquet INT96 timestamp coercion.
+///
+/// This enum validates time unit values at configuration time,
+/// ensuring only supported units ("ns", "us", "ms", "s") can be set
+/// via `SET` commands or proto deserialization.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum DFTimeUnit {
+    /// Nanoseconds
+    #[default]
+    NS,
+    /// Microseconds
+    US,
+    /// Milliseconds
+    MS,
+    /// Seconds
+    S,
+}
+
+/// Implement parsing strings to `DFTimeUnit`

Review Comment:
   This docstring is probably unnecessary



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -1428,7 +1430,8 @@ mod tests {
 
         let parquet_exec = scan_format(
             &state,
-            
&ParquetFormat::default().with_coerce_int96(Some("us".to_string())),
+            &ParquetFormat::default()
+                .with_coerce_int96(Some(DFTimeUnit::from_str("us").unwrap())),

Review Comment:
   Probably easier to just pass the `DFTimeUnit::US` here directly instead of 
creating via from_str?



##########
datafusion/common/src/parquet_config.rs:
##########
@@ -106,3 +106,93 @@ impl From<parquet::file::properties::WriterVersion> for 
DFParquetWriterVersion {
         }
     }
 }
+
+/// Time unit options for Parquet INT96 timestamp coercion.
+///
+/// This enum validates time unit values at configuration time,
+/// ensuring only supported units ("ns", "us", "ms", "s") can be set
+/// via `SET` commands or proto deserialization.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum DFTimeUnit {
+    /// Nanoseconds
+    #[default]
+    NS,
+    /// Microseconds
+    US,
+    /// Milliseconds
+    MS,
+    /// Seconds
+    S,
+}
+
+/// Implement parsing strings to `DFTimeUnit`
+impl FromStr for DFTimeUnit {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s.to_lowercase().as_str() {
+            "ns" => Ok(DFTimeUnit::NS),
+            "us" => Ok(DFTimeUnit::US),
+            "ms" => Ok(DFTimeUnit::MS),
+            "s" => Ok(DFTimeUnit::S),
+            other => Err(DataFusionError::Configuration(format!(
+                "Invalid parquet coerce_int96: {other}. Expected one of: ns, 
us, ms, s"
+            ))),
+        }
+    }
+}
+
+impl Display for DFTimeUnit {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            DFTimeUnit::NS => "ns",
+            DFTimeUnit::US => "us",
+            DFTimeUnit::MS => "ms",
+            DFTimeUnit::S => "s",
+        };
+        write!(f, "{s}")
+    }
+}
+
+impl ConfigField for DFTimeUnit {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) 
{
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = DFTimeUnit::from_str(value)?;
+        Ok(())
+    }
+}
+
+/// Convert `DFTimeUnit` to parquet crate's `arrow::datatypes::TimeUnit`
+///
+/// This conversion is infallible since `DFTimeUnit` only contains
+/// valid values that have been validated at configuration time.
+#[cfg(feature = "parquet")]
+impl From<DFTimeUnit> for arrow::datatypes::TimeUnit {
+    fn from(value: DFTimeUnit) -> Self {
+        match value {
+            DFTimeUnit::NS => arrow::datatypes::TimeUnit::Nanosecond,
+            DFTimeUnit::US => arrow::datatypes::TimeUnit::Microsecond,
+            DFTimeUnit::MS => arrow::datatypes::TimeUnit::Millisecond,
+            DFTimeUnit::S => arrow::datatypes::TimeUnit::Second,
+        }
+    }
+}
+
+/// Convert parquet crate's `arrow::datatypes::TimeUnit` to `DFTimeUnit`
+///
+/// This is used when converting from existing parquet TimeUnit,
+/// such as when reading from proto or test code.
+#[cfg(feature = "parquet")]
+impl From<arrow::datatypes::TimeUnit> for DFTimeUnit {

Review Comment:
   These ones too



##########
datafusion/common/src/config.rs:
##########
@@ -3508,4 +3508,28 @@ mod tests {
             "Invalid or Unsupported Configuration: Invalid parquet writer 
version: 3.0. Expected one of: 1.0, 2.0"
         );
     }
+    #[cfg(feature = "parquet")]
+    #[test]
+    fn test_parquet_coerce_int96_validation() {
+        use crate::{config::ConfigOptions, parquet_config::DFTimeUnit};
+
+        let mut config = ConfigOptions::default();
+
+        // Valid values should work
+        config
+            .set("datafusion.execution.parquet.coerce_int96", "ns")
+            .unwrap();
+        assert_eq!(config.execution.parquet.coerce_int96, 
Some(DFTimeUnit::NS));
+
+        // ... tests for "us", "ms", "s" ...

Review Comment:
   Is this a TODO comment?



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -1342,18 +1344,18 @@ mod tests {
 
         let time_units_and_expected = vec![
             (
-                None, // Same as "ns" time_unit
+                None, // default: None = "ns"
                 Arc::new(Int64Array::from(vec![
-                    Some(1704141296123456000), // Reads as nanosecond fine 
(note 3 extra 0s)
-                    Some(1704070800000000000), // Reads as nanosecond fine 
(note 3 extra 0s)
-                    Some(-4852191831933722624), // Cannot be represented with 
nanos timestamp (year 9999)

Review Comment:
   Curious as to why these comments are being removed



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -1342,18 +1344,18 @@ mod tests {
 
         let time_units_and_expected = vec![
             (
-                None, // Same as "ns" time_unit
+                None, // default: None = "ns"

Review Comment:
   If we treat `None` as defaulting to `ns` then perhaps we should remove the 
`Option` and just make it directly a `DFTimeUnit`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to