alamb opened a new issue, #9672: URL: https://github.com/apache/arrow-datafusion/issues/9672
### Is your feature request related to a problem or challenge? This comes from a Discord discussion: https://discord.com/channels/885562378132000778/1166447479609376850/1218573269662437477 Hello. I'm trying to add a brand new string column to an existing dataframe. How can it be done in an idiomatic way? I have this dataframe as input: ```rust +----+------+ | id | data | +----+------+ | 1 | 42 | | 2 | 43 | | 3 | 44 | +----+------+ ``` I want to get output similar to this (it's from polars): ```rust ┌─────┬──────┬─────────┐ │ id ┆ data ┆ new_col │ │ --- ┆ --- ┆ --- │ │ i32 ┆ i32 ┆ str │ ╞═════╪══════╪═════════╡ │ 1 ┆ 42 ┆ foo │ │ 2 ┆ 43 ┆ bar │ │ 3 ┆ 44 ┆ baz │ └─────┴──────┴─────────┘ ``` Code example: ```rust // in polars I can do like this: let mut df = df!( "id" => &[1, 2, 3], "data" => &[42, 43, 44], )?; let new_col = vec!["foo", "bar", "baz"]; let s = Series::new("new_col", new_col); let df = df.with_column(s)?; println!("{:?}", df); // don't understand how to do the same with datafusion let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("data", DataType::Int32, true), ])); let batch = RecordBatch::try_new( schema.clone(), vec![ Arc::new(Int32Array::from(vec![1, 2, 3])), Arc::new(Int32Array::from(vec![42, 43, 44])), ], )?; let ctx = SessionContext::new(); ctx.register_batch("t", batch)?; let df = ctx.table("t").await?; let data = vec!["foo", "bar", "baz"]; // mismatched type expected struct `GenericListArray<i32>` found struct `Vec<&str>` let res = df.with_column("new_col", Expr::Literal(ScalarValue::List(Arc::new(data))))?; ``` ### Describe the solution you'd like Add a `DataFrame::with_column` that does the same as https://docs.rs/polars/latest/polars/frame/struct.DataFrame.html#method.with_column ### Describe alternatives you've considered The user suggests: https://discord.com/channels/885562378132000778/1166447479609376850/1218908027609157702 The only way I've found that works is to create a new dataframe with the required 
column and join them. Please let me know if I'm missing something. ```rust let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("data", DataType::Int32, true), ])); let batch1 = RecordBatch::try_new( schema.clone(), vec![ Arc::new(Int32Array::from(vec![1, 2, 3])), Arc::new(Int32Array::from(vec![42, 43, 44])), ], )?; let ctx = SessionContext::new(); ctx.register_batch("t1", batch1.clone())?; let new_col = vec!["foo", "bar", "baz"]; let ids = schema.field_with_name("id")?.to_owned(); let ids_data = batch1.column_by_name("id").unwrap().clone(); let schema = Arc::new(Schema::new(vec![ ids, Field::new("new_col", DataType::Utf8, true), ])); let batch2 = RecordBatch::try_new( schema, vec![ ids_data, Arc::new(StringArray::from(new_col)), ] )?; ctx.register_batch("t2", batch2)?; let res = ctx .sql("select t1.id, t1.data, t2.new_col \ from t1 \ inner join t2 on t1.id = t2.id").await?; res.show().await?; ``` @Omega359 suggests https://discord.com/channels/885562378132000778/1166447479609376850/1218943627443830945 I would think that using the new unnest function (not yet in a released version that I'm aware of) would work ... 
however, when I tried it I got an error: ``` let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("data", DataType::Int32, true), ])); let batch = RecordBatch::try_new( schema.clone(), vec![ Arc::new(Int32Array::from(vec![1, 2, 3])), Arc::new(Int32Array::from(vec![42, 43, 44])), ], )?; let ctx = SessionContext::new(); ctx.register_batch("t", batch)?; let df = ctx.table("t").await?; let data = ["foo", "bar", "baz"]; let expr = make_array(data.iter().map(|&d| lit(d)).collect()); let res = df.with_column("new_col", Expr::Unnest(Unnest { exprs: vec![expr] }))?; res.show().await?; ``` Error: Context("type_coercion", Internal("Unnest should be rewritten to LogicalPlan::Unnest before type coercion")) ### Additional context _No response_ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
