tustvold opened a new issue, #2453:
URL: https://github.com/apache/arrow-datafusion/issues/2453

   **Describe the bug**
   
   ```
   use datafusion::prelude::{ParquetReadOptions, SessionContext};
   
   #[tokio::test]
   async fn temp() {
       let ctx = SessionContext::new();
   
       ctx.register_parquet("patient", 
"part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet",
                            ParquetReadOptions::default()).await.unwrap();
   
       let df = ctx.sql("SELECT patient.meta FROM patient LIMIT 
10").await.unwrap();
       df.show().await.unwrap();
   }
   ```
   
   Where part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet is 
the [parquet 
file](https://github.com/apache/arrow-datafusion/files/8626500/part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet.zip)
 provided by @kesavkolla in 
https://github.com/apache/arrow-datafusion/issues/2439
   
   This fails with
   
   ```
   called `Result::unwrap()` on an `Err` value: 
ArrowError(ExternalError(ArrowError(InvalidArgumentError("column types must 
match schema types, expected Struct([Field { name: \"id\", data_type: Utf8, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { 
name: \"extension\", data_type: List(Field { name: \"element\", data_type: 
Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { 
name: \"versionId\", data_type: Utf8, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }, Field { name: \"lastUpdated\", 
data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }, Field { name: \"source\", data_type: 
Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, 
Field { name: \"profile\", data_type: List(Field { name: \"element\", 
data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
None }), nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: None }, Field { name: \"security\", data_type: List(Field { 
name: \"element\", data_type: Struct([Field { name: \"id\", data_type: Utf8, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { 
name: \"extension\", data_type: List(Field { name: \"element\", data_type: 
Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { 
name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: None }, Field { name: \"version\", data_type: Utf8, nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: 
\"code\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: None }, Field { name: \"display\", data_type: Utf8, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: 
\"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }, Field { name: \"tag\", data_type: 
List(Field { name: \"element\", data_type: Struct([Field { name: \"id\", 
data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", 
data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, 
Field { name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }, Field { name: \"version\", data_type: 
Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, 
Field { name: \"code\", data_type: Utf8, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }, Field { name: \"display\", 
data_type: Utf8, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: 
\"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: None }]) but found Struct([Field { name: 
\"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: None }]) at column index 0"))))
   ```
   
   The problem arises because ParquetExec passes the projection indices for 
the arrow schema to get_record_reader_by_columns, which instead expects parquet 
column indices. In the presence of nested types, these are not the same thing.
   
   This is further complicated by 
https://github.com/apache/arrow-rs/issues/1652 and 
https://github.com/apache/arrow-rs/issues/1651
   
   **To Reproduce**
   
   Run the code above
   
   **Expected behavior**
   
   The code should not error


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to