That sounds potentially Spark-related to me. Feel free to open a JIRA
issue in Apache Spark if it is happening in Spark. If you can see the
schema metadata in Arrow, then the issue is probably on the Spark side.

On Thu, Oct 10, 2019 at 5:11 PM Isaac Myers
<isaacmy...@protonmail.com.invalid> wrote:
>
> Thanks for the quick response. When I use pyspark to read a parquet file 
> written by arrow, I can't see even file-level metadata. Is that also a known 
> issue? (Note: I searched the JIRA issues and couldn't find any info.)
>
>
> Sent with ProtonMail Secure Email.
>
> ‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
> On Thursday, October 10, 2019 12:44 PM, Wes McKinney <wesmck...@gmail.com> 
> wrote:
>
> > We haven't implemented storing field-level metadata in Parquet files
> > yet. It's somewhat tricky. See
> > https://issues.apache.org/jira/browse/ARROW-4359
> >
> > On Thu, Oct 10, 2019 at 11:51 AM Isaac Myers
> > isaacmy...@protonmail.com.invalid wrote:
> >
> > > I can write both field- and schema-level metadata and read the values 
> > > back from schema or relevant field. I write the schema and table 
> > > described by the schema to a local parquet file. Upon reading the table 
> > > or schema from the parquet file, only schema metadata are present and 
> > > field metadata are not present. Am I doing something wrong? Please view 
> > > the minimum working example below:
> > > <code>
> > > #include <vector>
> > > #include <cstdint>
> > > #include <map>
> > > #include <arrow/api.h>
> > > #include <arrow/io/api.h>
> > > #include <parquet/arrow/reader.h>
> > > #include <parquet/arrow/writer.h>
> > > #include <parquet/arrow/schema.h>
> > > //#include <arrow/>
> > > int main(int argc, char* argv[])
> > > {
> > > /*********************************
> > > Create Parquet File
> > > *********************************/
> > > arrow::Status st;
> > > > arrow::MemoryPool* pool = arrow::default_memory_pool();
> > > > // Create Schema and fields with metadata
> > > > std::vector<std::shared_ptr<arrow::Field>> fields;
> > > std::unordered_map<std::string, std::string> a_keyval;
> > > a_keyval["unit"] = "sec";
> > > a_keyval["note"] = "not the standard millisecond unit";
> > > arrow::KeyValueMetadata a_md(a_keyval);
> > > > std::shared_ptr<arrow::Field> a_field = arrow::field("a", arrow::int16(),
> > > > false, a_md.Copy());
> > > fields.push_back(a_field);
> > > std::unordered_map<std::string, std::string> b_keyval;
> > > b_keyval["unit"] = "ft";
> > > arrow::KeyValueMetadata b_md(b_keyval);
> > > > std::shared_ptr<arrow::Field> b_field = arrow::field("b", arrow::int16(),
> > > > false, b_md.Copy());
> > > > fields.push_back(b_field);
> > > > std::shared_ptr<arrow::Schema> schema = arrow::schema(fields);
> > > // Add metadata to schema.
> > > std::unordered_map<std::string, std::string> schema_keyval;
> > > schema_keyval["classification"] = "Type 0";
> > > arrow::KeyValueMetadata schema_md(schema_keyval);
> > > schema = schema->AddMetadata(schema_md.Copy());
> > > // Build arrays of data and add to Table.
> > > const int64_t rowgroup_size = 100;
> > > std::vector<int16_t> a_data(rowgroup_size, 0);
> > > std::vector<int16_t> b_data(rowgroup_size, 0);
> > > for (int16_t i = 0; i < rowgroup_size; i++)
> > > {
> > > a_data[i] = i;
> > > b_data[i] = rowgroup_size - i;
> > > }
> > > arrow::Int16Builder a_bldr(pool);
> > > arrow::Int16Builder b_bldr(pool);
> > > st = a_bldr.Resize(rowgroup_size);
> > > if (!st.ok()) return 1;
> > > st = b_bldr.Resize(rowgroup_size);
> > > if (!st.ok()) return 1;
> > > st = a_bldr.AppendValues(a_data);
> > > if (!st.ok()) return 1;
> > > st = b_bldr.AppendValues(b_data);
> > > if (!st.ok()) return 1;
> > > > std::shared_ptr<arrow::Array> a_arr_ptr;
> > > > std::shared_ptr<arrow::Array> b_arr_ptr;
> > > arrow::ArrayVector arr_vec;
> > > st = a_bldr.Finish(&a_arr_ptr);
> > > if (!st.ok()) return 1;
> > > arr_vec.push_back(a_arr_ptr);
> > > st = b_bldr.Finish(&b_arr_ptr);
> > > if (!st.ok()) return 1;
> > > arr_vec.push_back(b_arr_ptr);
> > > > std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arr_vec);
> > > // Test metadata
> > > printf("\nMetadata from original schema:\n");
> > > printf("%s\n", schema->metadata()->ToString().c_str());
> > > printf("%s\n", schema->field(0)->metadata()->ToString().c_str());
> > > printf("%s\n", schema->field(1)->metadata()->ToString().c_str());
> > > > std::shared_ptr<arrow::Schema> table_schema = table->schema();
> > > > printf("\nMetadata from schema retrieved from table (should be the same):\n");
> > > printf("%s\n", table_schema->metadata()->ToString().c_str());
> > > printf("%s\n", table_schema->field(0)->metadata()->ToString().c_str());
> > > printf("%s\n", table_schema->field(1)->metadata()->ToString().c_str());
> > > // Open file and write table.
> > > std::string file_name = "test.parquet";
> > > > std::shared_ptr<arrow::io::FileOutputStream> ostream;
> > > st = arrow::io::FileOutputStream::Open(file_name, &ostream);
> > > if (!st.ok()) return 1;
> > > > std::unique_ptr<parquet::arrow::FileWriter> writer;
> > > > std::shared_ptr<parquet::WriterProperties> props =
> > > > parquet::default_writer_properties();
> > > st = parquet::arrow::FileWriter::Open(*schema, pool, ostream, props, 
> > > &writer);
> > > if (!st.ok()) return 1;
> > > st = writer->WriteTable(*table, rowgroup_size);
> > > if (!st.ok()) return 1;
> > > // Close file and stream.
> > > st = writer->Close();
> > > if (!st.ok()) return 1;
> > > st = ostream->Close();
> > > if (!st.ok()) return 1;
> > > /*********************************
> > > Read Parquet File
> > > **********************************/
> > > // Create new memory pool. Not sure if this is necessary.
> > > //arrow::MemoryPool* pool2 = arrow::default_memory_pool();
> > > // Open file reader.
> > > > std::shared_ptr<arrow::io::ReadableFile> input_file;
> > > st = arrow::io::ReadableFile::Open(file_name, pool, &input_file);
> > > if (!st.ok()) return 1;
> > > > std::unique_ptr<parquet::arrow::FileReader> reader;
> > > st = parquet::arrow::OpenFile(input_file, pool, &reader);
> > > if (!st.ok()) return 1;
> > > // Get schema and read metadata.
> > > > std::shared_ptr<arrow::Schema> new_schema;
> > > st = reader->GetSchema(&new_schema);
> > > if (!st.ok()) return 1;
> > > printf("\nMetadata from schema read from file:\n");
> > > printf("%s\n", new_schema->metadata()->ToString().c_str());
> > > // Crashes because there are no metadata.
> > > > /*printf("%s\n", new_schema->field(0)->metadata()->ToString().c_str());
> > > > printf("%s\n",
> > > > new_schema->field(1)->metadata()->ToString().c_str());*/
> > > > printf("field name %s metadata exists: %d\n",
> > > > new_schema->field(0)->name().c_str(),
> > > > new_schema->field(0)->HasMetadata());
> > > printf("field name %s metadata exists: %d\n", 
> > > new_schema->field(1)->name().c_str(),
> > > new_schema->field(1)->HasMetadata());
> > > // What if I read the whole table and get the schema from it.
> > > > std::shared_ptr<arrow::Table> new_table;
> > > st = reader->ReadTable(&new_table);
> > > if (!st.ok()) return 1;
> > > > std::shared_ptr<arrow::Schema> schema_from_table = new_table->schema();
> > > > printf("\nMetadata from schema that is retrieved through table that is read from file:\n");
> > > printf("%s\n", schema_from_table->metadata()->ToString().c_str());
> > > // Crashes because there are no metadata.
> > > > /*printf("%s\n",
> > > > schema_from_table->field(0)->metadata()->ToString().c_str());
> > > > printf("%s\n",
> > > > schema_from_table->field(1)->metadata()->ToString().c_str());*/
> > > > printf("field name %s metadata exists: %d\n",
> > > > schema_from_table->field(0)->name().c_str(),
> > > > schema_from_table->field(0)->HasMetadata());
> > > printf("field name %s metadata exists: %d\n", 
> > > schema_from_table->field(1)->name().c_str(),
> > > schema_from_table->field(1)->HasMetadata());
> > > st = input_file->Close();
> > > if (!st.ok()) return 1;
> > > return 0;
> > > }
> > > </code>
> > > Sent with ProtonMail Secure Email.
>
>

Reply via email to