mapleFU commented on code in PR #37734:
URL: https://github.com/apache/arrow/pull/37734#discussion_r1329171222


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -799,12 +806,13 @@ Result<std::vector<compute::Expression>> 
ParquetFileFragment::TestRowGroups(
     statistics_expressions_complete_[match[0]] = true;
 
     const SchemaField& schema_field = manifest_->schema_fields[match[0]];
+    auto dest_field = physical_schema_->field(match[0]);

Review Comment:
   No. We cannot.
   
   See `SchemaManifest::Make`:
   
   ```c++
   Status SchemaManifest::Make(const SchemaDescriptor* schema,
                               const std::shared_ptr<const KeyValueMetadata>& 
metadata,
                               const ArrowReaderProperties& properties,
                               SchemaManifest* manifest) {
     SchemaTreeContext ctx;
     ctx.manifest = manifest;
     ctx.properties = properties;
     ctx.schema = schema;
     const GroupNode& schema_node = *schema->group_node();
     manifest->descr = schema;
     manifest->schema_fields.resize(schema_node.field_count());
   
     // Try to deserialize original Arrow schema
     RETURN_NOT_OK(
         GetOriginSchema(metadata, &manifest->schema_metadata, 
&manifest->origin_schema));
     // Ignore original schema if it's not compatible with the Parquet schema
     if (manifest->origin_schema != nullptr &&
         manifest->origin_schema->num_fields() != schema_node.field_count()) {
       manifest->origin_schema = nullptr;
     }
   
     for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
       SchemaField* out_field = &manifest->schema_fields[i];
       RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
                                       /*parent=*/nullptr, out_field));
   
       // TODO(wesm): as follow up to ARROW-3246, we should really pass the 
origin
       // schema (if any) through all functions in the schema reconstruction, 
but
       // I'm being lazy and just setting dictionary fields at the top level for
       // now
       if (manifest->origin_schema == nullptr) {
         continue;
       }
   
       auto& origin_field = manifest->origin_schema->field(i);
       RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
     }
     return Status::OK();
   }
   ```
   
   
   Firstly, schema will be deduced by `GetTypeForNode`:
   
   ```c++
   // ----------------------------------------------------------------------
   // Schema logic
   
   ::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
       int column_index, const schema::PrimitiveNode& primitive_node,
       SchemaTreeContext* ctx) {
     ASSIGN_OR_RAISE(
         std::shared_ptr<ArrowType> storage_type,
         GetArrowType(primitive_node, 
ctx->properties.coerce_int96_timestamp_unit()));
     if (ctx->properties.read_dictionary(column_index) &&
         IsDictionaryReadSupported(*storage_type)) {
       return ::arrow::dictionary(::arrow::int32(), storage_type);
     }
     return storage_type;
   }
   ```
   
   Secondly, it will has a `origin_schema`.
   
   So.
   
   1. `origin_schema` is the arrow-written schema
   2. `field` only has deduced schema
   
   we use deduced schema here, which cause the problem
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to