alamb commented on code in PR #9613:
URL: https://github.com/apache/arrow-datafusion/pull/9613#discussion_r1529129103
##########
datafusion-examples/examples/deserialize_to_struct.rs:
##########
@@ -15,61 +15,61 @@
// specific language governing permissions and limitations
// under the License.
+use arrow::array::AsArray;
+use arrow::datatypes::{Float64Type, Int32Type};
use datafusion::error::Result;
use datafusion::prelude::*;
-use serde::Deserialize;
+use futures::StreamExt;
/// This example shows that it is possible to convert query results into Rust
structs .
-/// It will collect the query results into RecordBatch, then convert it to
serde_json::Value.
-/// Then, serde_json::Value is turned into Rust's struct.
-/// Any datatype with `Deserialize` implemeneted works.
#[tokio::main]
async fn main() -> Result<()> {
let data_list = Data::new().await?;
println!("{data_list:#?}");
Ok(())
}
-#[derive(Deserialize, Debug)]
+#[derive(Debug)]
struct Data {
#[allow(dead_code)]
- int_col: i64,
+ int_col: i32,
#[allow(dead_code)]
double_col: f64,
}
impl Data {
pub async fn new() -> Result<Vec<Self>> {
// this group is almost the same as the one you find it in
parquet_sql.rs
- let batches = {
- let ctx = SessionContext::new();
+ let ctx = SessionContext::new();
- let testdata = datafusion::test_util::parquet_test_data();
+ let testdata = datafusion::test_util::parquet_test_data();
- ctx.register_parquet(
- "alltypes_plain",
- &format!("{testdata}/alltypes_plain.parquet"),
- ParquetReadOptions::default(),
- )
- .await?;
+ ctx.register_parquet(
+ "alltypes_plain",
+ &format!("{testdata}/alltypes_plain.parquet"),
+ ParquetReadOptions::default(),
+ )
+ .await?;
- let df = ctx
- .sql("SELECT int_col, double_col FROM alltypes_plain")
- .await?;
+ let df = ctx
+ .sql("SELECT int_col, double_col FROM alltypes_plain")
+ .await?;
- df.clone().show().await?;
+ df.clone().show().await?;
- df.collect().await?
- };
- let batches: Vec<_> = batches.iter().collect();
+ let mut stream = df.execute_stream().await?;
+ let mut list = vec![];
+ while let Some(b) = stream.next().await.transpose()? {
+ let int_col = b.column(0).as_primitive::<Int32Type>();
+ let float_col = b.column(1).as_primitive::<Float64Type>();
- // converts it to serde_json type and then convert that into Rust type
Review Comment:
> I'd dispute that we ever really had a way to do this, going via
serde_json::Value is more of a hack than anything else. Serializing to a JSON
string and back will likely be faster
The key thing in my mind is to make it easy for new users to get something
working quickly. I am well aware that a custom array -> struct conversion will
give the best performance, but I think it takes non-trivial expertise in
manipulating the arrow-rs API (especially when it comes to StructArray and
ListArray) -- so I think it is important to offer them a fast way to get
started, even if it relies on a slower API.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]