tustvold commented on code in PR #4773:
URL: https://github.com/apache/arrow-rs/pull/4773#discussion_r1328487225
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -524,6 +665,21 @@ impl Type {
}
}
+ fn physical_type_as_rust(&self) -> proc_macro2::TokenStream {
+ use parquet::basic::Type as BasicType;
+
+ match self.physical_type() {
+ BasicType::BOOLEAN => quote! { bool },
+ BasicType::INT32 => quote! { i32 },
+ BasicType::INT64 => quote! { i64 },
+ BasicType::INT96 => unimplemented!("96-bit int currently is not
supported"),
+ BasicType::FLOAT => quote! { f32 },
+ BasicType::DOUBLE => quote! { f64 },
+ BasicType::BYTE_ARRAY => quote! { ::parquet::data_type::ByteArray
},
+ BasicType::FIXED_LEN_BYTE_ARRAY => quote! {
::parquet::data_type::ByteArray },
Review Comment:
```suggestion
BasicType::FIXED_LEN_BYTE_ARRAY => quote! {
::parquet::data_type::FixedLenByteArray },
```
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -354,6 +428,50 @@ impl Field {
}
}
+ fn copied_direct_fields(&self) -> proc_macro2::TokenStream {
+ let field_name = &self.ident;
+ let is_a_byte_buf = self.is_a_byte_buf;
+ let is_a_timestamp =
+ self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime);
+ let is_a_date = self.third_party_type ==
Some(ThirdPartyType::ChronoNaiveDate);
+ let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid);
+
+ let value = if is_a_timestamp {
Review Comment:
This might be more clearly written as a match block
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -354,6 +428,50 @@ impl Field {
}
}
+ fn copied_direct_fields(&self) -> proc_macro2::TokenStream {
+ let field_name = &self.ident;
+ let is_a_byte_buf = self.is_a_byte_buf;
+ let is_a_timestamp =
+ self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime);
+ let is_a_date = self.third_party_type ==
Some(ThirdPartyType::ChronoNaiveDate);
+ let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid);
+
+ let value = if is_a_timestamp {
+ quote! {
::chrono::naive::NaiveDateTime::from_timestamp_millis(vals[i]).unwrap() }
+ } else if is_a_date {
+ quote! {
::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i]
+ + ((::chrono::naive::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()
+ - ::chrono::naive::NaiveDate::from_ymd_opt(0, 12,
31).unwrap()).num_days()) as i32).unwrap() }
+ } else if is_a_uuid {
+ quote! {
::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap() }
+ } else if is_a_byte_buf {
+ quote! { vals[i].data().convert() }
+ } else {
+ quote! { vals[i].convert() }
+ };
+
+ // The below code would support references in field types, but it
prevents the compiler
Review Comment:
How would references work, who would own the allocation from which the field
borrows?
##########
parquet/src/record/record_reader.rs:
##########
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::super::errors::ParquetError;
+use super::super::file::reader::RowGroupReader;
+
+pub trait RecordReader<T> {
Review Comment:
Perhaps a doc comment, in particular explaining what T is
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -682,6 +838,66 @@ impl Type {
}
}
+// returns quote of function needed to convert from parquet
+// types back into the types used in the struct fields.
+pub fn get_convertable_quote() -> proc_macro2::TokenStream {
Review Comment:
If I'm reading this correctly this will generate this trait for every record
reader, could we not just add the correct conversion logic to the reader
snippet?
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -354,6 +428,50 @@ impl Field {
}
}
+ fn copied_direct_fields(&self) -> proc_macro2::TokenStream {
+ let field_name = &self.ident;
+ let is_a_byte_buf = self.is_a_byte_buf;
+ let is_a_timestamp =
+ self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime);
+ let is_a_date = self.third_party_type ==
Some(ThirdPartyType::ChronoNaiveDate);
+ let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid);
+
+ let value = if is_a_timestamp {
+ quote! {
::chrono::naive::NaiveDateTime::from_timestamp_millis(vals[i]).unwrap() }
+ } else if is_a_date {
+ quote! {
::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i]
Review Comment:
I think this would be better expressed using signed_duration_since
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -223,6 +223,80 @@ impl Field {
}
}
+ /// Takes the parsed field of the struct and emits a valid
+ /// column reader snippet. Should match exactly what you
+ /// would write by hand.
+ ///
+ /// Can only generate writers for basic structs, for example:
+ ///
+ /// struct Record {
+ /// a_bool: bool
+ /// }
+ ///
+ /// but not
+ ///
+ /// struct UnsupportedNestedRecord {
+ /// a_property: bool,
+ /// nested_record: Record
+ /// }
+ ///
+ /// because this parsing logic is not sophisticated enough for definition
+ /// levels beyond 2.
+ ///
+ /// `Option` types and references not supported
+ pub fn reader_snippet(&self) -> proc_macro2::TokenStream {
+ use parquet::basic::Type as BasicType;
+
+ let ident = &self.ident;
+ let column_reader = self.ty.column_reader();
+ let parquet_type = self.ty.physical_type_as_rust();
+
+ let default_type = match self.ty.physical_type() {
+ BasicType::BYTE_ARRAY | BasicType::FIXED_LEN_BYTE_ARRAY => {
+ quote! { parquet::data_type::ByteArray::new() }
+ }
+ BasicType::BOOLEAN => quote! { false },
+ BasicType::FLOAT | BasicType::DOUBLE => quote! { 0. },
+ BasicType::INT32 | BasicType::INT64 | BasicType::INT96 => quote! {
0 },
+ };
+
+ let write_batch_expr = quote! {
+ let mut vals_vec = Vec::new();
+ vals_vec.resize(max_records, #default_type);
+ let mut vals: &mut [#parquet_type] = vals_vec.as_mut_slice();
+ if let #column_reader(mut typed) = column_reader {
+ typed.read_records(max_records, None, None, vals)?;
+ } else {
+ panic!("Schema and struct disagree on type for {}",
stringify!{#ident});
+ }
+ };
+
+ let vals_writer = match &self.ty {
Review Comment:
I think some comments would help explain what this is doing
##########
parquet_derive/src/parquet_field.rs:
##########
@@ -223,6 +223,80 @@ impl Field {
}
}
+ /// Takes the parsed field of the struct and emits a valid
+ /// column reader snippet. Should match exactly what you
+ /// would write by hand.
+ ///
+ /// Can only generate writers for basic structs, for example:
+ ///
+ /// struct Record {
+ /// a_bool: bool
+ /// }
+ ///
+ /// but not
+ ///
+ /// struct UnsupportedNestedRecord {
+ /// a_property: bool,
+ /// nested_record: Record
+ /// }
+ ///
+ /// because this parsing logic is not sophisticated enough for definition
+ /// levels beyond 2.
+ ///
+ /// `Option` types and references not supported
+ pub fn reader_snippet(&self) -> proc_macro2::TokenStream {
+ use parquet::basic::Type as BasicType;
+
+ let ident = &self.ident;
+ let column_reader = self.ty.column_reader();
+ let parquet_type = self.ty.physical_type_as_rust();
+
+ let default_type = match self.ty.physical_type() {
Review Comment:
Could possibly use `Default::default` here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]