mapleFU commented on code in PR #14351:
URL: https://github.com/apache/arrow/pull/14351#discussion_r1046633695
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -502,6 +514,140 @@ TEST_F(TestLocalFile, OpenWithMetadata) {
ASSERT_EQ(metadata.get(), reader2->metadata().get());
}
+TEST(TestDataPageV1Checksum, CorruptPage) {
+ // works when not checking crc.
+ {
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum());
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+ auto column0 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+ auto column1 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+ EXPECT_NE(nullptr, column0);
+ EXPECT_NE(nullptr, column1);
+ const int kPageSize = 1024 * 10;
+ const int kMembers = kPageSize * 2 / sizeof(int32_t);
+ size_t column_a_size = 0;
+ size_t column_b_size = 0;
+ std::array<int32_t, 1024> values{};
+ while (column0->HasNext()) {
+ int64_t values_read;
+ int64_t real_read =
+ column0->ReadBatch(1024, nullptr, nullptr, values.data(),
&values_read);
+ EXPECT_EQ(real_read, values_read);
+ column_a_size += values_read;
+ }
+
+ while (column1->HasNext()) {
+ int64_t values_read;
+ int64_t real_read =
+ column1->ReadBatch(1024, nullptr, nullptr, values.data(),
&values_read);
+ EXPECT_EQ(real_read, values_read);
+ column_b_size += values_read;
+ }
+
+ EXPECT_EQ(kMembers, column_a_size);
+ EXPECT_EQ(kMembers, column_b_size);
+ }
+ // check crc will read failed
+ {
+ ReaderProperties readerProperties;
+ readerProperties.set_use_page_checksum_verification(true);
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum(),
false,
+ readerProperties);
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+
+ auto column0 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+ auto column1 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+ EXPECT_NE(nullptr, column0);
+ EXPECT_NE(nullptr, column1);
+
+ auto column_a_page_reader = rg->GetColumnPageReader(0);
+ auto column_b_page_reader = rg->GetColumnPageReader(1);
+
+ EXPECT_THROW(column_a_page_reader->NextPage(), ParquetException);
+ EXPECT_NE(nullptr, column_b_page_reader->NextPage());
+ EXPECT_THROW(column_b_page_reader->NextPage(), ParquetException);
+ }
+}
+
+void testCheckCrc(const std::string& local_file_name) {
+ // works when not checking crc.
+ {
Review Comment:
I think the cases differ in their logic:
* Some enable CRC checking, some do not
* Some check for an exception, some do not
Since there are only 3 cases, I just pasted the trivial logic into each of them
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -502,6 +514,140 @@ TEST_F(TestLocalFile, OpenWithMetadata) {
ASSERT_EQ(metadata.get(), reader2->metadata().get());
}
+TEST(TestDataPageV1Checksum, CorruptPage) {
+ // works when not checking crc.
+ {
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum());
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+ auto column0 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+ auto column1 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+ EXPECT_NE(nullptr, column0);
+ EXPECT_NE(nullptr, column1);
+ const int kPageSize = 1024 * 10;
+ const int kMembers = kPageSize * 2 / sizeof(int32_t);
+ size_t column_a_size = 0;
+ size_t column_b_size = 0;
+ std::array<int32_t, 1024> values{};
+ while (column0->HasNext()) {
+ int64_t values_read;
+ int64_t real_read =
+ column0->ReadBatch(1024, nullptr, nullptr, values.data(),
&values_read);
+ EXPECT_EQ(real_read, values_read);
+ column_a_size += values_read;
+ }
+
+ while (column1->HasNext()) {
+ int64_t values_read;
+ int64_t real_read =
+ column1->ReadBatch(1024, nullptr, nullptr, values.data(),
&values_read);
+ EXPECT_EQ(real_read, values_read);
+ column_b_size += values_read;
+ }
+
+ EXPECT_EQ(kMembers, column_a_size);
+ EXPECT_EQ(kMembers, column_b_size);
+ }
+ // check crc will read failed
+ {
+ ReaderProperties readerProperties;
+ readerProperties.set_use_page_checksum_verification(true);
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum(),
false,
+ readerProperties);
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+
+ auto column0 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+ auto column1 =
std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+ EXPECT_NE(nullptr, column0);
+ EXPECT_NE(nullptr, column1);
+
+ auto column_a_page_reader = rg->GetColumnPageReader(0);
+ auto column_b_page_reader = rg->GetColumnPageReader(1);
+
+ EXPECT_THROW(column_a_page_reader->NextPage(), ParquetException);
+ EXPECT_NE(nullptr, column_b_page_reader->NextPage());
+ EXPECT_THROW(column_b_page_reader->NextPage(), ParquetException);
+ }
+}
+
+void testCheckCrc(const std::string& local_file_name) {
+ // works when not checking crc.
+ {
Review Comment:
I think the cases differ in their logic:
* Some enable CRC checking, some do not
* Some check for an exception, some do not
Since there are only 3 cases, I just pasted the trivial logic into each of them
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]