wgtmac commented on code in PR #34054:
URL: https://github.com/apache/arrow/pull/34054#discussion_r1133277991


##########
cpp/src/parquet/page_index_test.cc:
##########
@@ -416,4 +416,420 @@ TEST(PageIndex, 
DeterminePageIndexRangesInRowGroupWithMissingPageIndex) {
                          -1);
 }
 
+TEST(PageIndex, WriteOffsetIndex) {
+  /// Create offset index via the OffsetIndexBuilder interface.
+  auto builder = OffsetIndexBuilder::Make();
+  const size_t num_pages = 5;
+  const std::vector<int64_t> offsets = {100, 200, 300, 400, 500};
+  const std::vector<int32_t> page_sizes = {1024, 2048, 3072, 4096, 8192};
+  const std::vector<int64_t> first_row_indices = {0, 10000, 20000, 30000, 
40000};
+  for (size_t i = 0; i < num_pages; ++i) {
+    builder->AddPage(offsets[i], page_sizes[i], first_row_indices[i]);
+  }
+  const int64_t final_position = 4096;
+  builder->Finish(final_position);
+
+  std::vector<std::unique_ptr<OffsetIndex>> offset_indexes;
+  /// 1st element is the offset index just built.
+  offset_indexes.emplace_back(builder->Build());
+  /// 2nd element is the offset index restored by serialize-then-deserialize 
round trip.
+  auto sink = CreateOutputStream();
+  builder->WriteTo(sink.get());
+  PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
+  offset_indexes.emplace_back(OffsetIndex::Make(buffer->data(),
+                                                
static_cast<uint32_t>(buffer->size()),
+                                                default_reader_properties()));
+
+  /// Verify the data of the offset index.
+  for (const auto& offset_index : offset_indexes) {
+    ASSERT_EQ(num_pages, offset_index->page_locations().size());
+    for (size_t i = 0; i < num_pages; ++i) {
+      const auto& page_location = offset_index->page_locations().at(i);
+      ASSERT_EQ(offsets[i] + final_position, page_location.offset);
+      ASSERT_EQ(page_sizes[i], page_location.compressed_page_size);
+      ASSERT_EQ(first_row_indices[i], page_location.first_row_index);
+    }
+  }
+}
+
+void TestWriteTypedColumnIndex(schema::NodePtr node,
+                               const std::vector<EncodedStatistics>& 
page_stats,
+                               BoundaryOrder::type boundary_order, bool 
has_null_counts) {
+  auto descr = std::make_unique<ColumnDescriptor>(node, 
/*max_definition_level=*/1, 0);
+
+  auto builder = ColumnIndexBuilder::Make(descr.get());
+  for (const auto& stats : page_stats) {
+    builder->AddPage(stats);
+  }
+  ASSERT_NO_THROW(builder->Finish());
+
+  std::vector<std::unique_ptr<ColumnIndex>> column_indexes;
+  /// 1st element is the column index just built.
+  column_indexes.emplace_back(builder->Build());
+  /// 2nd element is the column index restored by serialize-then-deserialize 
round trip.
+  auto sink = CreateOutputStream();
+  builder->WriteTo(sink.get());
+  PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
+  column_indexes.emplace_back(ColumnIndex::Make(*descr, buffer->data(),
+                                                
static_cast<uint32_t>(buffer->size()),
+                                                default_reader_properties()));
+
+  /// Verify the data of the column index.
+  for (const auto& column_index : column_indexes) {
+    ASSERT_EQ(boundary_order, column_index->boundary_order());
+    ASSERT_EQ(has_null_counts, column_index->has_null_counts());
+    const size_t num_pages = column_index->null_pages().size();
+    for (size_t i = 0; i < num_pages; ++i) {
+      ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]);
+      ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]);
+      ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]);
+      if (has_null_counts) {
+        ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]);
+      }
+    }
+  }
+}
+
+TEST(PageIndex, WriteInt32ColumnIndex) {
+  auto encode = [=](int32_t value) {
+    return std::string(reinterpret_cast<const char*>(&value), sizeof(int32_t));
+  };
+
+  // Integer values in the ascending order.
+  std::vector<EncodedStatistics> page_stats(3);
+  page_stats.at(0).set_null_count(1).set_min(encode(1)).set_max(encode(2));
+  page_stats.at(1).set_null_count(2).set_min(encode(2)).set_max(encode(3));
+  page_stats.at(2).set_null_count(3).set_min(encode(3)).set_max(encode(4));
+
+  TestWriteTypedColumnIndex(schema::Int32("c1"), page_stats, 
BoundaryOrder::Ascending,
+                            /*has_null_counts=*/true);
+}
+
+TEST(PageIndex, WriteInt64ColumnIndex) {
+  auto encode = [=](int64_t value) {
+    return std::string(reinterpret_cast<const char*>(&value), sizeof(int64_t));
+  };
+
+  // Integer values in the descending order.
+  std::vector<EncodedStatistics> page_stats(3);
+  page_stats.at(0).set_null_count(4).set_min(encode(-1)).set_max(encode(-2));
+  page_stats.at(1).set_null_count(0).set_min(encode(-2)).set_max(encode(-3));
+  page_stats.at(2).set_null_count(4).set_min(encode(-3)).set_max(encode(-4));
+
+  TestWriteTypedColumnIndex(schema::Int64("c1"), page_stats, 
BoundaryOrder::Descending,
+                            /*has_null_counts=*/true);
+}
+
+TEST(PageIndex, WriteFloatColumnIndex) {
+  auto encode = [=](float value) {
+    return std::string(reinterpret_cast<const char*>(&value), sizeof(float));
+  };
+
+  // Float values with no specific order.
+  std::vector<EncodedStatistics> page_stats(3);
+  
page_stats.at(0).set_null_count(0).set_min(encode(2.2F)).set_max(encode(4.4F));

Review Comment:
   There is already some logic that handles NaN values: 
https://github.com/apache/arrow/blob/main/cpp/src/parquet/statistics.cc#L300. I 
will look into it later to see whether further action is required.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to