wgtmac commented on code in PR #47775:
URL: https://github.com/apache/arrow/pull/47775#discussion_r2425149861
##########
cpp/src/parquet/properties.h:
##########
@@ -1409,4 +1409,74 @@ struct ArrowWriteContext {
PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+class PARQUET_EXPORT RewriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
+ writer_properties_(default_writer_properties()),
+ reader_properties_(default_reader_properties()) {}
+
+ explicit Builder(const RewriterProperties& properties)
Review Comment:
Let's start simple and remove this?
##########
cpp/src/parquet/platform.h:
##########
@@ -109,4 +109,9 @@ PARQUET_EXPORT
std::shared_ptr<ResizableBuffer> AllocateBuffer(
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size =
0);
+PARQUET_EXPORT
+void CopyStream(std::shared_ptr<ArrowInputStream> from,
Review Comment:
This function looks too specialized. Should we just define it in the
file_rewriter.cc?
##########
cpp/src/parquet/page_index.h:
##########
@@ -86,6 +87,8 @@ class PARQUET_EXPORT ColumnIndex {
/// \brief List of repetition level histograms for each page concatenated
together.
virtual const std::vector<int64_t>& repetition_level_histograms() const = 0;
+
+ virtual const void* to_thrift() const = 0;
Review Comment:
I'm not sure if this is on the critical path. Is it simpler and cleaner to
convert `ColumnIndex` and `OffsetIndex` to their thrift equivalents?
##########
cpp/src/parquet/page_index.cc:
##########
@@ -208,9 +212,12 @@ class OffsetIndexImpl : public OffsetIndex {
return unencoded_byte_array_data_bytes_;
}
+ const void* to_thrift() const override { return &offset_index_; }
+
private:
std::vector<PageLocation> page_locations_;
std::vector<int64_t> unencoded_byte_array_data_bytes_;
+ format::OffsetIndex offset_index_;
Review Comment:
This approach may double the memory consumption. If we really want to go
with this approach, getters of `OffsetIndexImpl` should try to return values
directly from `offset_index_` and remove `page_locations_`.
##########
cpp/src/parquet/properties.h:
##########
@@ -1409,4 +1409,74 @@ struct ArrowWriteContext {
PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+class PARQUET_EXPORT RewriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
Review Comment:
What about initializing them to nullptr and only assign to default values
when they are not provided in `build()`?
##########
cpp/src/parquet/properties.h:
##########
@@ -1409,4 +1409,74 @@ struct ArrowWriteContext {
PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+class PARQUET_EXPORT RewriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
+ writer_properties_(default_writer_properties()),
+ reader_properties_(default_reader_properties()) {}
+
+ explicit Builder(const RewriterProperties& properties)
+ : pool_(properties.memory_pool()),
+ writer_properties_(properties.writer_properties()),
+ reader_properties_(properties.reader_properties()) {}
+
+ virtual ~Builder() = default;
+
+ /// Specify the memory pool for the rewriter. Default default_memory_pool.
+ Builder* memory_pool(MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+ }
+
+ /// Set the writer properties.
+ Builder* writer_properties(std::shared_ptr<WriterProperties> properties) {
+ writer_properties_ = std::move(properties);
+ return this;
+ }
+
+ /// Set the reader properties.
+ Builder* reader_properties(ReaderProperties properties) {
+ reader_properties_ = std::move(properties);
+ return this;
+ }
+
+ /// Build the RewriterProperties with the builder parameters.
+ std::shared_ptr<RewriterProperties> build() {
+ return std::shared_ptr<RewriterProperties>(
+ new RewriterProperties(pool_, writer_properties_,
reader_properties_));
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::shared_ptr<WriterProperties> writer_properties_;
+ ReaderProperties reader_properties_;
Review Comment:
It looks strange that one is a shared_ptr but the other isn't.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]