bkietz commented on code in PR #37734:
URL: https://github.com/apache/arrow/pull/37734#discussion_r1327514177


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -799,12 +806,13 @@ Result<std::vector<compute::Expression>> 
ParquetFileFragment::TestRowGroups(
     statistics_expressions_complete_[match[0]] = true;
 
     const SchemaField& schema_field = manifest_->schema_fields[match[0]];
+    auto dest_field = physical_schema_->field(match[0]);

Review Comment:
   Under what circumstances is the physical schema field different from the 
manifest's? It looks like they are assembled using the same code path and it 
seems we could even assert equality here
   ```suggestion
       auto dest_field = physical_schema_->field(match[0]);
       DCHECK(schema_field.field->Equals(physical_schema_->field(match[0])));
   ```



##########
cpp/src/arrow/dataset/file_parquet_test.cc:
##########
@@ -725,10 +729,31 @@ TEST(TestParquetStatistics, NullMax) {
   auto reader =
       parquet::ParquetFileReader::OpenFile(dir_string + 
"/nan_in_stats.parquet");
   auto statistics = 
reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics();
-  auto stat_expression =
-      ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+  auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(
+      *field, *statistics, field->type());
   EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
 }
 
+TEST(TestParquetStatistics, SchemaCast) {
+  auto arrow_src_field = ::arrow::field("t", duration(TimeUnit::NANO));
+  auto table = TableFromJSON(schema({arrow_src_field}), {
+                                                            R"([{"t": 1}])",
+                                                        });
+  TableBatchReader table_reader(*table);
+  ArrowWriterProperties::Builder builder;
+  builder.store_schema();
+  ASSERT_OK_AND_ASSIGN(auto buffer,
+                       ParquetFormatHelper::Write(&table_reader, 
builder.build()));
+  std::shared_ptr<io::BufferReader> buffer_reader =
+      std::make_shared<io::BufferReader>(buffer);
+  auto reader = parquet::ParquetFileReader::Open(
+      buffer_reader, parquet::default_reader_properties(), nullptr);
+  auto statistics = 
reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics();
+  auto manifest_reduce_field = ::arrow::field("t", int64());
+  auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(
+      *manifest_reduce_field, *statistics, arrow_src_field->type());
+  EXPECT_EQ(stat_expression->ToString(), "(t == 1)");

Review Comment:
   This test doesn't appear to assert correct data type



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -112,11 +112,16 @@ bool IsNan(const Scalar& value) {
 }
 
 std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
-    const SchemaField& schema_field, const parquet::RowGroupMetaData& 
metadata) {
+    const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata,
+    const std::shared_ptr<arrow::DataType>& dest_type) {
   // For the remaining of this function, failure to extract/parse statistics
   // are ignored by returning nullptr. The goal is two fold. First

Review Comment:
   ```suggestion
     // Failure to extract/parse statistics is indicated by returning
     // nullopt. The goal is two fold. First
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to