mapleFU commented on issue #43682:
URL: https://github.com/apache/arrow/issues/43682#issuecomment-2314493678

   So sorry to be misleading...
   
   ```
   // Writes a small Parquet file through the low-level ParquetFileWriter +
   // StreamWriter API, embedding a serialized Arrow schema under the
   // "ARROW:schema" key-value metadata entry so that Arrow readers restore
   // logical types (here: dictionary encoding) that the stream API cannot
   // express directly.
   void WriteParquetFile() {
     std::shared_ptr<arrow::io::FileOutputStream> outfile;
   
     PARQUET_ASSIGN_OR_THROW(
         outfile,
         arrow::io::FileOutputStream::Open("parquet-stream-api-example.parquet"));
   
     parquet::WriterProperties::Builder builder;
   
     // Arrow-level schema: this is what readers reconstruct from the embedded
     // "ARROW:schema" metadata, so it must line up column-by-column with the
     // physical Parquet schema declared below:
     //   foo -> int64             (physical INT64 / INT_64)
     //   bar -> timestamp[us]     (physical INT64 / Timestamp MICROS)
     //   baz -> dictionary<utf8>  (physical BYTE_ARRAY / String; the dictionary
     //          annotation is the "hack" that makes readers decode the column
     //          back into a dictionary array)
     auto arrow_schema = arrow::schema(
         {arrow::field("foo", arrow::int64()),
          arrow::field("bar", arrow::timestamp(arrow::TimeUnit::MICRO)),
          arrow::field("baz", arrow::dictionary(arrow::int8(), arrow::utf8()))});
   
   #if defined ARROW_WITH_BROTLI
     builder.compression(parquet::Compression::BROTLI);
   #elif defined ARROW_WITH_ZSTD
     builder.compression(parquet::Compression::ZSTD);
   #endif
   
     // Physical Parquet schema — what is actually stored in the file.
     auto fields = parquet::schema::NodeVector{};
     fields.push_back(parquet::schema::PrimitiveNode::Make(
         "foo", parquet::Repetition::REQUIRED, parquet::Type::INT64,
         parquet::ConvertedType::INT_64));
     // Timestamp(is_adjusted_to_utc=false, MICROS,
     //           is_from_converted_type=false, force_set_converted_type=true)
     fields.push_back(parquet::schema::PrimitiveNode::Make(
         "bar", parquet::Repetition::REQUIRED,
         parquet::LogicalType::Timestamp(
             false, parquet::LogicalType::TimeUnit::MICROS, false, true),
         parquet::Type::INT64));
     fields.push_back(parquet::schema::PrimitiveNode::Make(
         "baz", parquet::Repetition::REQUIRED, parquet::LogicalType::String(),
         parquet::Type::BYTE_ARRAY));
     auto parquet_schema = std::static_pointer_cast<parquet::schema::GroupNode>(
         parquet::schema::GroupNode::Make("schema",
                                          parquet::Repetition::REQUIRED, fields));
   
     auto parquet_writer =
         parquet::ParquetFileWriter::Open(outfile, parquet_schema, builder.build());
   
     // Serialize the Arrow schema and store it base64-encoded: the raw IPC
     // bytes are not valid UTF-8, which Thrift requires for metadata values.
     auto serialized_schema =
         ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool())
             .ValueOrDie();
     std::string schema_as_string = serialized_schema->ToString();
     std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
     parquet_writer->AddKeyValueMetadata(
         ::arrow::KeyValueMetadata::Make({"ARROW:schema"}, {schema_base64}));
   
     parquet::StreamWriter os{std::move(parquet_writer)};
   
     os.SetMaxRowGroupSize(1000);
   
     // One value per column, in schema order, then EndRow.
     // NOTE(review): assumes UserTimestamp's operator<< emits a value
     // compatible with the MICROS timestamp column — confirm its definition.
     for (auto i = 0; i < TestData::num_rows; ++i) {
       os << int64_t(1000000);
       os << UserTimestamp{std::chrono::milliseconds{1000000 * i}};
       os << "def";
       os << parquet::EndRow;
   
       if (i == TestData::num_rows / 2) {
         os << parquet::EndRowGroup;
       }
     }
     std::cout << "Parquet Stream Writing complete." << std::endl;
   }
   ```
   
   @adampinky85 The core point here is
   
   ```
     // The serialized schema is not UTF-8, which is required for Thrift
     std::string schema_as_string = serialized_schema->ToString();
     std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
     
parquet_writer->AddKeyValueMetadata(::arrow::KeyValueMetadata::Make({"ARROW:schema"},
 {schema_base64}));
   ```
   
   It's a hack, but I think it could be used.
   
   > With the streaming API, see top of the issue, we're unable to set 
dictionary fields. I would have thought it will always store these fields as 
byte array strings rather than integers with dictionaries?
   
   Nope... This is a hack in Arrow. Parquet dictionary encoding is **unrelated** to 
the Arrow dictionary schema. That schema metadata is set by the Arrow Parquet 
writer; the stream writer lacks this.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to