[
https://issues.apache.org/jira/browse/ARROW-18140?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Alessandro Molina updated ARROW-18140:
--------------------------------------
Component/s: C++
> The metadata info will be lost in the parquet file schema after writing the parquet
> file by calling the FileSystemDataset::Write() method.
> -----------------------------------------------------------------------------------------------------------------------------------
>
> Key: ARROW-18140
> URL: https://issues.apache.org/jira/browse/ARROW-18140
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++
> Reporter: Ke Jia
> Priority: Major
>
> This issue can be reproduced by the following code.
> auto format = std::make_shared<ParquetFileFormat>();
> auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
> FileSystemDatasetWriteOptions write_options;
> write_options.file_write_options = format->DefaultWriteOptions();
> write_options.filesystem = fs;
> write_options.base_dir = "root";
> write_options.partitioning = std::make_shared<HivePartitioning>(schema({}));
> write_options.basename_template = "{i}.parquet";
> auto metadata =
> std::shared_ptr<KeyValueMetadata>(new KeyValueMetadata({"foo"},
> {"bar"}));
> auto dataset_schema = schema({field("a", int64())}, metadata);
> RecordBatchVector batches{
> ConstantArrayGenerator::Zeroes(kRowsPerBatch, dataset_schema)};
> ASSERT_EQ(0, batches[0]->column(0)->null_count());
> auto dataset = std::make_shared<InMemoryDataset>(dataset_schema, batches);
> ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
> ASSERT_OK(scanner_builder->Project(
> {compute::call("add", {compute::field_ref("a"), compute::literal(1)})},
> {"a_plus_one"}));
> ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
> // Before the write, the schema has the metadata info.
> ASSERT_EQ(1, dataset_schema->HasMetadata());
> ASSERT_OK(FileSystemDataset::Write(write_options, scanner));
> ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make(
> fs, {"root/0.parquet"},
> format, {}));
> ASSERT_OK_AND_ASSIGN(auto written_dataset,
> dataset_factory->Finish(FinishOptions{}));
> // After the write, the schema does not have the metadata info.
> ASSERT_EQ(0, written_dataset->schema()->HasMetadata());
--
This message was sent by Atlassian Jira
(v8.20.10#820010)