Copilot commented on code in PR #47775:
URL: https://github.com/apache/arrow/pull/47775#discussion_r2803860949


##########
cpp/src/parquet/file_rewriter.cc:
##########
@@ -0,0 +1,811 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_rewriter.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <ranges>
+#include <sstream>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"  // IWYU pragma: keep
+#include "parquet/bloom_filter_reader.h"
+#include "parquet/bloom_filter_writer.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/file_writer.h"
+#include "parquet/index_location.h"
+#include "parquet/metadata.h"
+#include "parquet/page_index.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace {
+void CopyStream(std::shared_ptr<ArrowInputStream> from,
+                std::shared_ptr<ArrowOutputStream> to, int64_t size,
+                ::arrow::MemoryPool* pool) {
+  int64_t bytes_copied = 0;
+  if (from->supports_zero_copy()) {
+    while (bytes_copied < size) {
+      PARQUET_ASSIGN_OR_THROW(auto buffer, from->Read(size - bytes_copied));
+      if (buffer->size() == 0) {
+        throw ParquetException("Unexpected end of stream at ", bytes_copied);
+      }
+      PARQUET_THROW_NOT_OK(to->Write(buffer->data(), buffer->size()));
+      bytes_copied += buffer->size();
+    }
+    return;
+  }
+
+  std::shared_ptr<ResizableBuffer> buffer =
+      AllocateBuffer(pool, kDefaultOutputStreamSize);
+  while (bytes_copied < size) {
+    PARQUET_ASSIGN_OR_THROW(
+        auto read_size,
+        from->Read(std::min(size - bytes_copied, buffer->size()), &buffer));
+    if (read_size == 0) {
+      throw ParquetException("Unexpected end of stream at ", bytes_copied);
+    }
+    PARQUET_THROW_NOT_OK(to->Write(buffer->data(), read_size));
+    bytes_copied += read_size;
+  }
+}
+}  // namespace
+
+const std::shared_ptr<RewriterProperties>& default_rewriter_properties() {
+  static std::shared_ptr<RewriterProperties> default_rewriter_properties =
+      RewriterProperties::Builder().build();
+  return default_rewriter_properties;
+}
+
+class PagesRewriter {
+ public:
+  PagesRewriter(const RewriterProperties& props, std::unique_ptr<PageReader> page_reader,
+                std::unique_ptr<PageWriter> page_writer,
+                std::shared_ptr<OffsetIndex> original_offset_index)
+      : props_(props),
+        page_reader_(std::move(page_reader)),
+        page_writer_(std::move(page_writer)),
+        original_offset_index_(std::move(original_offset_index)) {}
+
+  void WritePages() {
+    bool has_dictionary = false;
+    bool fallback = false;
+    std::shared_ptr<Page> page;
+    size_t page_no = 0;
+    while ((page = page_reader_->NextPage()) != nullptr) {
+      switch (page->type()) {
+        case parquet::PageType::DICTIONARY_PAGE: {
+          WriteDictionaryPage(*static_cast<const DictionaryPage*>(page.get()));
+          has_dictionary = true;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE: {
+          auto& data_page = *static_cast<const DataPageV1*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV1(data_page, page_no);
+          page_no++;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE_V2: {
+          auto& data_page = *static_cast<const DataPageV2*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV2(data_page, page_no);

Review Comment:
   Dictionary fallback detection is incorrect for dictionary-encoded data 
pages: DataPageV2 commonly uses `Encoding::RLE_DICTIONARY` (and V1 can also use 
it), but the code treats anything other than `PLAIN_DICTIONARY` as fallback. 
This will mark `dictionary_fallback` true even when dictionary encoding is used 
normally. Consider using `IsDictionaryIndexEncoding(data_page.encoding())` 
(types.h) and only setting fallback when a dictionary page was seen and a 
subsequent data page is *not* a dictionary index encoding.
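   
   A minimal sketch of that check for the V1 branch (the V2 branch would mirror it), using `IsDictionaryIndexEncoding` from types.h as suggested above:
   ```cpp
   case parquet::PageType::DATA_PAGE: {
     auto& data_page = *static_cast<const DataPageV1*>(page.get());
     // Fallback means the writer abandoned the dictionary mid-chunk: only a
     // non-dictionary-index encoding *after* a dictionary page signals it.
     if (has_dictionary && !IsDictionaryIndexEncoding(data_page.encoding())) {
       fallback = true;
     }
     WriteDataPageV1(data_page, page_no);
     page_no++;
     break;
   }
   ```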



##########
cpp/src/parquet/page_index.cc:
##########
@@ -31,6 +31,7 @@
 #include "arrow/util/unreachable.h"
 
 #include <limits>
+#include <memory>
 #include <numeric>

Review Comment:
   This file now uses `std::declval` and `std::is_same_v` in `SerializeIndex`, 
but it doesn't include the standard headers that define them (`<utility>` for 
`declval`, `<type_traits>` for `is_same_v`). Please add those includes 
explicitly rather than relying on transitive includes.
   ```suggestion
   #include <numeric>
   #include <type_traits>
   #include <utility>
   ```



##########
cpp/src/parquet/file_rewriter.cc:
##########
@@ -0,0 +1,811 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_rewriter.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <ranges>
+#include <sstream>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"  // IWYU pragma: keep
+#include "parquet/bloom_filter_reader.h"
+#include "parquet/bloom_filter_writer.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/file_writer.h"
+#include "parquet/index_location.h"
+#include "parquet/metadata.h"
+#include "parquet/page_index.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace {
+void CopyStream(std::shared_ptr<ArrowInputStream> from,
+                std::shared_ptr<ArrowOutputStream> to, int64_t size,
+                ::arrow::MemoryPool* pool) {
+  int64_t bytes_copied = 0;
+  if (from->supports_zero_copy()) {
+    while (bytes_copied < size) {
+      PARQUET_ASSIGN_OR_THROW(auto buffer, from->Read(size - bytes_copied));
+      if (buffer->size() == 0) {
+        throw ParquetException("Unexpected end of stream at ", bytes_copied);
+      }
+      PARQUET_THROW_NOT_OK(to->Write(buffer->data(), buffer->size()));
+      bytes_copied += buffer->size();
+    }
+    return;
+  }
+
+  std::shared_ptr<ResizableBuffer> buffer =
+      AllocateBuffer(pool, kDefaultOutputStreamSize);
+  while (bytes_copied < size) {
+    PARQUET_ASSIGN_OR_THROW(
+        auto read_size,
+        from->Read(std::min(size - bytes_copied, buffer->size()), &buffer));
+    if (read_size == 0) {
+      throw ParquetException("Unexpected end of stream at ", bytes_copied);
+    }
+    PARQUET_THROW_NOT_OK(to->Write(buffer->data(), read_size));
+    bytes_copied += read_size;
+  }
+}
+}  // namespace
+
+const std::shared_ptr<RewriterProperties>& default_rewriter_properties() {
+  static std::shared_ptr<RewriterProperties> default_rewriter_properties =
+      RewriterProperties::Builder().build();
+  return default_rewriter_properties;
+}
+
+class PagesRewriter {
+ public:
+  PagesRewriter(const RewriterProperties& props, std::unique_ptr<PageReader> page_reader,
+                std::unique_ptr<PageWriter> page_writer,
+                std::shared_ptr<OffsetIndex> original_offset_index)
+      : props_(props),
+        page_reader_(std::move(page_reader)),
+        page_writer_(std::move(page_writer)),
+        original_offset_index_(std::move(original_offset_index)) {}
+
+  void WritePages() {
+    bool has_dictionary = false;
+    bool fallback = false;
+    std::shared_ptr<Page> page;
+    size_t page_no = 0;
+    while ((page = page_reader_->NextPage()) != nullptr) {
+      switch (page->type()) {
+        case parquet::PageType::DICTIONARY_PAGE: {
+          WriteDictionaryPage(*static_cast<const DictionaryPage*>(page.get()));
+          has_dictionary = true;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE: {
+          auto& data_page = *static_cast<const DataPageV1*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV1(data_page, page_no);
+          page_no++;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE_V2: {
+          auto& data_page = *static_cast<const DataPageV2*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV2(data_page, page_no);
+          page_no++;
+          break;
+        }
+        default: {
+          ARROW_LOG(DEBUG) << "Unsupported page type: " << static_cast<int>(page->type());
+          break;
+        }
+      }
+    }
+    page_writer_->Close(has_dictionary, has_dictionary && fallback);
+  }
+
+  int64_t total_uncompressed_size() const { return total_uncompressed_size_; }
+
+ private:
+  void WriteDictionaryPage(const DictionaryPage& dict_page) {
+    total_uncompressed_size_ += page_writer_->WriteDictionaryPage(dict_page);
+  }
+
+  void WriteDataPageV1(const DataPageV1& data_page, const size_t page_no) {
+    std::shared_ptr<Buffer> compressed_data;
+    if (page_writer_->has_compressor()) {
+      auto buffer = std::static_pointer_cast<ResizableBuffer>(
+          AllocateBuffer(props_.memory_pool(), data_page.size()));
+      page_writer_->Compress(*data_page.buffer(), buffer.get());
+      compressed_data = std::move(buffer);
+    } else {
+      compressed_data = data_page.buffer();
+    }
+    auto first_row_index =
+        original_offset_index_
+            ? std::optional{original_offset_index_->page_locations()[page_no]
+                                .first_row_index}
+            : std::nullopt;
+    SizeStatistics size_statistics;
+    size_statistics.unencoded_byte_array_data_bytes =
+        original_offset_index_ &&
+                !original_offset_index_->unencoded_byte_array_data_bytes().empty()
+            ? std::optional{original_offset_index_
+                                ->unencoded_byte_array_data_bytes()[page_no]}
+            : std::nullopt;
+    DataPageV1 new_page(compressed_data, data_page.num_values(), data_page.encoding(),
+                        data_page.definition_level_encoding(),
+                        data_page.repetition_level_encoding(),
+                        data_page.uncompressed_size(), data_page.statistics(),
+                        first_row_index, size_statistics);
+    total_uncompressed_size_ += page_writer_->WriteDataPage(new_page);
+  }
+
+  void WriteDataPageV2(const DataPageV2& data_page, const size_t page_no) {
+    int32_t levels_byte_len = data_page.repetition_levels_byte_length() +
+                              data_page.definition_levels_byte_length();
+    bool page_is_compressed = false;
+    std::shared_ptr<Buffer> output_buffer;
+    if (page_writer_->has_compressor() && data_page.size() > levels_byte_len) {
+      auto values_buffer = SliceBuffer(data_page.buffer(), levels_byte_len);
+      auto compressed_values = std::static_pointer_cast<ResizableBuffer>(
+          AllocateBuffer(props_.memory_pool(), values_buffer->size()));
+      page_writer_->Compress(*values_buffer, compressed_values.get());
+      if (compressed_values->size() < values_buffer->size()) {
+        page_is_compressed = true;
+        int64_t combined_size = levels_byte_len + compressed_values->size();
+        auto combined = AllocateBuffer(props_.memory_pool(), combined_size);
+        if (levels_byte_len > 0) {
+          std::memcpy(combined->mutable_data(), data_page.data(), levels_byte_len);
+        }
+        std::memcpy(combined->mutable_data() + levels_byte_len, compressed_values->data(),
+                    compressed_values->size());
+        output_buffer = std::move(combined);
+      }
+    }
+    if (!page_is_compressed) {
+      output_buffer = data_page.buffer();
+    }
+
+    auto first_row_index =
+        original_offset_index_
+            ? std::optional{original_offset_index_->page_locations()[page_no]
+                                .first_row_index}
+            : std::nullopt;
+    SizeStatistics size_statistics;
+    size_statistics.unencoded_byte_array_data_bytes =
+        original_offset_index_ &&
+                !original_offset_index_->unencoded_byte_array_data_bytes().empty()
+            ? std::optional{original_offset_index_
+                                ->unencoded_byte_array_data_bytes()[page_no]}
+            : std::nullopt;
+    DataPageV2 new_page(output_buffer, data_page.num_values(), data_page.num_nulls(),
+                        data_page.num_rows(), data_page.encoding(),
+                        data_page.definition_levels_byte_length(),
+                        data_page.repetition_levels_byte_length(),
+                        data_page.uncompressed_size(), page_is_compressed,
+                        data_page.statistics(), first_row_index, size_statistics);
+    total_uncompressed_size_ += page_writer_->WriteDataPage(new_page);
+  }
+
+  const RewriterProperties& props_;
+  std::unique_ptr<PageReader> page_reader_;
+  std::unique_ptr<PageWriter> page_writer_;
+  std::shared_ptr<OffsetIndex> original_offset_index_;
+  int64_t total_uncompressed_size_{0};
+};
+
+class ColumnChunkRewriter {
+ public:
+  ColumnChunkRewriter(std::shared_ptr<ArrowInputFile> source,
+                      std::shared_ptr<ArrowOutputStream> sink,
+                      const RewriterProperties& props,
+                      std::unique_ptr<ColumnChunkMetaData> metadata,
+                      std::shared_ptr<RowGroupPageIndexReader> page_index_reader,
+                      std::shared_ptr<RowGroupBloomFilterReader> bloom_filter_reader,
+                      int32_t row_group_ordinal, int32_t column_ordinal, bool fast_copy)
+      : source_(std::move(source)),
+        sink_(std::move(sink)),
+        props_(props),
+        metadata_(std::move(metadata)),
+        page_index_reader_(std::move(page_index_reader)),
+        bloom_filter_reader_(std::move(bloom_filter_reader)),
+        row_group_ordinal_(row_group_ordinal),
+        column_ordinal_(column_ordinal),
+        fast_copy_(fast_copy) {
+    if (metadata_ == nullptr) {
+      throw ParquetException("ColumnChunkMetaData should not be nullptr");
+    }
+  }
+
+  static bool CanFastCopy(const RewriterProperties& props,
+                          const ColumnChunkMetaData& metadata) {
+    Compression::type original_codec = metadata.compression();
+    auto column_path = metadata.path_in_schema();
+    Compression::type new_codec = props.writer_properties()->compression(column_path);
+    return (original_codec == new_codec);
+  }
+
+  void WriteColumnChunkData(RowGroupMetaDataBuilder& rg_metadata_builder,
+                            PageIndexBuilder* page_index_builder,
+                            BloomFilterBuilder* bloom_filter_builder,
+                            int64_t& total_bytes_written) {
+    auto& reader_props = props_.reader_properties();
+    auto& writer_props = *props_.writer_properties();
+    auto stream = reader_props.GetStream(source_, metadata_->start_offset(),
+                                         metadata_->total_compressed_size());
+
+    if (fast_copy_) {
+      PARQUET_ASSIGN_OR_THROW(int64_t sink_offset, sink_->Tell());
+      int64_t shift = sink_offset - metadata_->start_offset();
+
+      CopyStream(stream, sink_, metadata_->total_compressed_size(), props_.memory_pool());
+      PARQUET_THROW_NOT_OK(stream->Close());
+
+      rg_metadata_builder.NextColumnChunk(std::move(metadata_), shift);
+
+      if (page_index_reader_ != nullptr && page_index_builder != nullptr) {
+        auto offset_index = page_index_reader_->GetOffsetIndex(column_ordinal_);
+        if (offset_index != nullptr) {
+          page_index_builder->SetOffsetIndex(column_ordinal_, offset_index, shift);
+        }
+      }
+
+      total_bytes_written += metadata_->total_uncompressed_size();
+    } else {

Review Comment:
   In the fast-copy branch, `metadata_` is moved into 
`rg_metadata_builder.NextColumnChunk(std::move(metadata_), shift);` and then 
immediately dereferenced (`metadata_->total_uncompressed_size()`). This is a 
use-after-move and will likely crash. Capture `total_uncompressed_size()` (and 
any other needed fields) before moving `metadata_`, or compute 
`total_bytes_written` from the copied thrift metadata inside the builder.
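   
   A minimal sketch of the first option, reading the size before ownership is transferred (names taken from the quoted diff):
   ```cpp
   // Capture everything needed from metadata_ before it is moved.
   const int64_t total_uncompressed_size = metadata_->total_uncompressed_size();
   rg_metadata_builder.NextColumnChunk(std::move(metadata_), shift);
   // ... page index handling unchanged ...
   total_bytes_written += total_uncompressed_size;
   ```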



##########
cpp/src/parquet/file_rewriter.cc:
##########
@@ -0,0 +1,811 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_rewriter.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <ranges>
+#include <sstream>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"  // IWYU pragma: keep
+#include "parquet/bloom_filter_reader.h"
+#include "parquet/bloom_filter_writer.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/file_writer.h"
+#include "parquet/index_location.h"
+#include "parquet/metadata.h"
+#include "parquet/page_index.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace {
+void CopyStream(std::shared_ptr<ArrowInputStream> from,
+                std::shared_ptr<ArrowOutputStream> to, int64_t size,
+                ::arrow::MemoryPool* pool) {
+  int64_t bytes_copied = 0;
+  if (from->supports_zero_copy()) {
+    while (bytes_copied < size) {
+      PARQUET_ASSIGN_OR_THROW(auto buffer, from->Read(size - bytes_copied));
+      if (buffer->size() == 0) {
+        throw ParquetException("Unexpected end of stream at ", bytes_copied);
+      }
+      PARQUET_THROW_NOT_OK(to->Write(buffer->data(), buffer->size()));
+      bytes_copied += buffer->size();
+    }
+    return;
+  }
+
+  std::shared_ptr<ResizableBuffer> buffer =
+      AllocateBuffer(pool, kDefaultOutputStreamSize);
+  while (bytes_copied < size) {
+    PARQUET_ASSIGN_OR_THROW(
+        auto read_size,
+        from->Read(std::min(size - bytes_copied, buffer->size()), &buffer));
+    if (read_size == 0) {
+      throw ParquetException("Unexpected end of stream at ", bytes_copied);
+    }
+    PARQUET_THROW_NOT_OK(to->Write(buffer->data(), read_size));
+    bytes_copied += read_size;
+  }
+}
+}  // namespace
+
+const std::shared_ptr<RewriterProperties>& default_rewriter_properties() {
+  static std::shared_ptr<RewriterProperties> default_rewriter_properties =
+      RewriterProperties::Builder().build();
+  return default_rewriter_properties;
+}
+
+class PagesRewriter {
+ public:
+  PagesRewriter(const RewriterProperties& props, std::unique_ptr<PageReader> page_reader,
+                std::unique_ptr<PageWriter> page_writer,
+                std::shared_ptr<OffsetIndex> original_offset_index)
+      : props_(props),
+        page_reader_(std::move(page_reader)),
+        page_writer_(std::move(page_writer)),
+        original_offset_index_(std::move(original_offset_index)) {}
+
+  void WritePages() {
+    bool has_dictionary = false;
+    bool fallback = false;
+    std::shared_ptr<Page> page;
+    size_t page_no = 0;
+    while ((page = page_reader_->NextPage()) != nullptr) {
+      switch (page->type()) {
+        case parquet::PageType::DICTIONARY_PAGE: {
+          WriteDictionaryPage(*static_cast<const DictionaryPage*>(page.get()));
+          has_dictionary = true;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE: {
+          auto& data_page = *static_cast<const DataPageV1*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV1(data_page, page_no);
+          page_no++;
+          break;
+        }
+        case parquet::PageType::DATA_PAGE_V2: {
+          auto& data_page = *static_cast<const DataPageV2*>(page.get());
+          if (data_page.encoding() != Encoding::PLAIN_DICTIONARY) {
+            fallback = true;
+          }
+          WriteDataPageV2(data_page, page_no);
+          page_no++;
+          break;
+        }
+        default: {
+          ARROW_LOG(DEBUG) << "Unsupported page type: " << static_cast<int>(page->type());
+          break;
+        }
+      }
+    }
+    page_writer_->Close(has_dictionary, has_dictionary && fallback);
+  }
+
+  int64_t total_uncompressed_size() const { return total_uncompressed_size_; }
+
+ private:
+  void WriteDictionaryPage(const DictionaryPage& dict_page) {
+    total_uncompressed_size_ += page_writer_->WriteDictionaryPage(dict_page);
+  }
+
+  void WriteDataPageV1(const DataPageV1& data_page, const size_t page_no) {
+    std::shared_ptr<Buffer> compressed_data;
+    if (page_writer_->has_compressor()) {
+      auto buffer = std::static_pointer_cast<ResizableBuffer>(
+          AllocateBuffer(props_.memory_pool(), data_page.size()));
+      page_writer_->Compress(*data_page.buffer(), buffer.get());
+      compressed_data = std::move(buffer);
+    } else {
+      compressed_data = data_page.buffer();
+    }
+    auto first_row_index =
+        original_offset_index_
+            ? std::optional{original_offset_index_->page_locations()[page_no]
+                                .first_row_index}
+            : std::nullopt;
+    SizeStatistics size_statistics;
+    size_statistics.unencoded_byte_array_data_bytes =
+        original_offset_index_ &&
+                !original_offset_index_->unencoded_byte_array_data_bytes().empty()
+            ? std::optional{original_offset_index_
+                                ->unencoded_byte_array_data_bytes()[page_no]}
+            : std::nullopt;
+    DataPageV1 new_page(compressed_data, data_page.num_values(), data_page.encoding(),
+                        data_page.definition_level_encoding(),
+                        data_page.repetition_level_encoding(),
+                        data_page.uncompressed_size(), data_page.statistics(),
+                        first_row_index, size_statistics);
+    total_uncompressed_size_ += page_writer_->WriteDataPage(new_page);
+  }
+
+  void WriteDataPageV2(const DataPageV2& data_page, const size_t page_no) {
+    int32_t levels_byte_len = data_page.repetition_levels_byte_length() +
+                              data_page.definition_levels_byte_length();
+    bool page_is_compressed = false;
+    std::shared_ptr<Buffer> output_buffer;
+    if (page_writer_->has_compressor() && data_page.size() > levels_byte_len) {
+      auto values_buffer = SliceBuffer(data_page.buffer(), levels_byte_len);
+      auto compressed_values = std::static_pointer_cast<ResizableBuffer>(
+          AllocateBuffer(props_.memory_pool(), values_buffer->size()));
+      page_writer_->Compress(*values_buffer, compressed_values.get());
+      if (compressed_values->size() < values_buffer->size()) {
+        page_is_compressed = true;
+        int64_t combined_size = levels_byte_len + compressed_values->size();
+        auto combined = AllocateBuffer(props_.memory_pool(), combined_size);
+        if (levels_byte_len > 0) {
+          std::memcpy(combined->mutable_data(), data_page.data(), levels_byte_len);
+        }
+        std::memcpy(combined->mutable_data() + levels_byte_len, compressed_values->data(),

Review Comment:
   `std::memcpy` is used here, but the file doesn't include `<cstring>`. This 
relies on transitive includes and may fail to compile under stricter header 
hygiene. Add an explicit `#include <cstring>` in this translation unit.
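   
   For instance, the standard-library include block near the top of this file would gain one line:
   ```cpp
   #include <algorithm>
   #include <cstring>  // std::memcpy
   #include <memory>
   ```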


