Jefffrey commented on code in PR #22702:
URL: https://github.com/apache/datafusion/pull/22702#discussion_r3354679076
##########
testing:
##########
Review Comment:
Do the submodules need to be fixed here?
##########
datafusion/core/src/dataframe/mod.rs:
##########
@@ -2471,6 +2469,65 @@ impl DataFrame {
&self,
value: ScalarValue,
columns: Vec<String>,
+ ) -> Result<DataFrame> {
+ self.fill_columns(value, &columns, coalesce(), |_| true)
+ }
+
+ // Helper to find columns from names
+ fn find_columns(&self, names: &[impl AsRef<str>]) -> Result<Vec<FieldRef>>
{
+ let schema = self.logical_plan().schema();
+ names
+ .iter()
+ .map(|name| {
+ let name = name.as_ref();
+ schema
+ .field_with_name(None, name)
+ .cloned()
+ .map_err(|_| plan_datafusion_err!("Column '{}' not found",
name))
+ })
+ .collect()
+ }
+
+ /// Fill NaN values in specified floating-point columns with a given value
+ /// If no columns are specified (empty slice), applies to all columns
+ /// Only floating-point columns are affected; other columns are left
unchanged
+ /// Only fills if the value can be cast to the column's type
+ ///
+ /// # Arguments
+ /// * `value` - Value to fill NaNs with
+ /// * `columns` - List of column names to fill. If empty, fills all
columns.
+ ///
+ /// # Example
+ /// ```
+ /// # use datafusion::prelude::*;
+ /// # use datafusion::error::Result;
+ /// # use datafusion_common::ScalarValue;
+ /// # #[tokio::main]
+ /// # async fn main() -> Result<()> {
+ /// let ctx = SessionContext::new();
+ /// let df = ctx
+ /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
+ /// .await?;
+ /// // Fill NaN in only columns "a" and "c":
+ /// let df = df.fill_nan(ScalarValue::from(0.0), &["a", "c"])?;
+ /// // Fill NaN across all columns:
+ /// let df = df.fill_nan(ScalarValue::from(0.0), &[])?;
+ /// # Ok(())
+ /// # }
+ /// ```
+ pub fn fill_nan(&self, value: ScalarValue, columns: &[&str]) ->
Result<DataFrame> {
+ self.fill_columns(value, columns, nanvl(), |field| {
+ field.data_type().is_floating()
+ })
+ }
+
+ #[expect(clippy::needless_pass_by_value)]
Review Comment:
Consider fixing this lint rather than suppressing it, since this is a private function now.
##########
datafusion/core/src/dataframe/mod.rs:
##########
@@ -2527,6 +2528,78 @@ impl DataFrame {
.collect()
}
+ /// Fill NaN values in specified columns with a given value
+ /// If no columns are specified (empty vector), applies to all columns
+ /// Only fills if the value can be cast to the column's type
+ ///
+ /// # Arguments
+ /// * `value` - Value to fill NaNs with
+ /// * `columns` - List of column names to fill. If empty, fills all
columns.
+ ///
+ /// # Example
+ /// ```
+ /// # use datafusion::prelude::*;
+ /// # use datafusion::error::Result;
+ /// # use datafusion_common::ScalarValue;
+ /// # #[tokio::main]
+ /// # async fn main() -> Result<()> {
+ /// let ctx = SessionContext::new();
+ /// let df = ctx
+ /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
+ /// .await?;
+ /// // Fill NaN in only columns "a" and "c":
+ /// let df = df.fill_nan(ScalarValue::from(0.0), vec!["a".to_owned(),
"c".to_owned()])?;
+ /// // Fill NaN across all columns:
+ /// let df = df.fill_nan(ScalarValue::from(0.0), vec![])?;
+ /// # Ok(())
+ /// # }
+ /// ```
+ #[expect(clippy::needless_pass_by_value)]
+ pub fn fill_nan(
+ &self,
+ value: ScalarValue,
+ columns: Vec<String>,
+ ) -> Result<DataFrame> {
Review Comment:
this works, thanks
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]