pitrou commented on code in PR #14351:
URL: https://github.com/apache/arrow/pull/14351#discussion_r1048536801
##########
cpp/src/parquet/file_deserialize_test.cc:
##########
@@ -315,6 +325,127 @@ TEST_F(TestPageSerde, LZONotSupported) {
ASSERT_THROW(InitSerializedPageReader(data_size, Compression::LZO),
ParquetException);
}
+TEST_F(TestPageSerde, CrcEnabled) {
+ auto codec_types = GetSupportedCodecTypes();
+ codec_types.push_back(Compression::UNCOMPRESSED);
+ const int32_t num_rows = 32; // dummy value
+ data_page_header_.num_values = num_rows;
+
+ const int num_pages = 10;
+
+ std::vector<std::vector<uint8_t>> faux_data;
+ faux_data.resize(num_pages);
+ for (int i = 0; i < num_pages; ++i) {
+ // The pages keep getting larger
+ int page_size = (i + 1) * 64;
+ test::random_bytes(page_size, 0, &faux_data[i]);
+ }
+ for (auto codec_type : codec_types) {
+ auto codec = GetCodec(codec_type);
+
+ std::vector<uint8_t> buffer;
+ for (int i = 0; i < num_pages; ++i) {
+ const uint8_t* data = faux_data[i].data();
+ int data_size = static_cast<int>(faux_data[i].size());
+ int64_t actual_size;
+ if (codec == nullptr) {
+ buffer = faux_data[i];
+ actual_size = data_size;
+ } else {
+ int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
+ buffer.resize(max_compressed_size);
+
+ ASSERT_OK_AND_ASSIGN(
+ actual_size,
+ codec->Compress(data_size, data, max_compressed_size, &buffer[0]));
+ }
+
+ uint32_t checksum =
+ ::arrow::internal::crc32(/* prev */ 0, buffer.data(), actual_size);
+ ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(
+ 1024, data_size, static_cast<int32_t>(actual_size), checksum));
+ ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
+ }
+ ReaderProperties readerProperties;
+ readerProperties.set_data_page_checksum_verification(true);
+    InitSerializedPageReader(num_rows * num_pages, codec_type,
+                             readerProperties);
+
+ std::shared_ptr<Page> page;
+ const DataPageV1* data_page;
+ for (int i = 0; i < num_pages; ++i) {
+ int data_size = static_cast<int>(faux_data[i].size());
+ page = page_reader_->NextPage();
+ data_page = static_cast<const DataPageV1*>(page.get());
+ ASSERT_EQ(data_size, data_page->size());
+ ASSERT_EQ(0, memcmp(faux_data[i].data(), data_page->data(), data_size));
+ }
+
+ ResetStream();
+ }
+}
+
+TEST_F(TestPageSerde, CrcNotExists) {
+ int stats_size = 512;
+ const int32_t num_rows = 4444;
+ AddDummyStats(stats_size, data_page_header_, /* fill_all_stats = */ true);
+ data_page_header_.num_values = num_rows;
+
+ ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader());
+ ReaderProperties readerProperties;
+ readerProperties.set_data_page_checksum_verification(true);
+  InitSerializedPageReader(num_rows, Compression::UNCOMPRESSED,
+                           readerProperties);
+ std::shared_ptr<Page> current_page = page_reader_->NextPage();
+  ASSERT_NO_FATAL_FAILURE(
+      CheckDataPageHeader(data_page_header_, current_page.get()));
+}
+
+TEST_F(TestPageSerde, CrcCorrupt) {
+ auto codec_types = GetSupportedCodecTypes();
+ codec_types.push_back(Compression::UNCOMPRESSED);
+ const int32_t num_rows = 32; // dummy value
+ data_page_header_.num_values = num_rows;
+
+ std::vector<uint8_t> faux_data;
+
+ // The pages keep getting larger
+ int page_size = 6400;
+ test::random_bytes(page_size, 0, &faux_data);
+
+ for (auto codec_type : codec_types) {
+ auto codec = GetCodec(codec_type);
+
+ std::vector<uint8_t> buffer;
+
+ const uint8_t* data = faux_data.data();
+ int data_size = static_cast<int>(faux_data.size());
+ int64_t actual_size;
+ if (codec == nullptr) {
+ buffer = faux_data;
+ actual_size = data_size;
+ } else {
+ int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
+ buffer.resize(max_compressed_size);
+
+ ASSERT_OK_AND_ASSIGN(
+          actual_size, codec->Compress(data_size, data, max_compressed_size,
+                                       &buffer[0]));
+ }
+
+ uint32_t wrong_checksum =
+ ::arrow::internal::crc32(/* prev */ 0, buffer.data(), actual_size) + 1;
+ ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(
+ 1024, data_size, static_cast<int32_t>(actual_size), wrong_checksum));
+ ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
+
+ ReaderProperties readerProperties;
+ readerProperties.set_data_page_checksum_verification(true);
+ InitSerializedPageReader(1, codec_type, readerProperties);
Review Comment:
This is mostly copy-pasted from `CrcEnabled` above, can you create helper
methods to factor out such boilerplate?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -502,6 +514,160 @@ TEST_F(TestLocalFile, OpenWithMetadata) {
ASSERT_EQ(metadata.get(), reader2->metadata().get());
}
+// https://github.com/google/googletest/pull/2904 not available in our version
+// of gtest/gmock
Review Comment:
The user might have installed a more recent version of GTest, so I would
guard this with a `#ifdef`.
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -502,6 +514,160 @@ TEST_F(TestLocalFile, OpenWithMetadata) {
ASSERT_EQ(metadata.get(), reader2->metadata().get());
}
+// https://github.com/google/googletest/pull/2904 not available in our version
+// of gtest/gmock
+#define EXPECT_THROW_THAT(callable, ex_type, property) \
+ EXPECT_THROW( \
+ try { (callable)(); } catch (const ex_type& err) { \
+ EXPECT_THAT(err, (property)); \
+ throw; \
+ }, \
+ ex_type)
+
+TEST(TestDataPageV1Checksum, CorruptPage) {
+ // works when not checking crc.
+ {
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum());
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+    auto column0 =
+        std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+    auto column1 =
+        std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+    EXPECT_NE(nullptr, column0);
+    EXPECT_NE(nullptr, column1);
Review Comment:
This is all boilerplate, perhaps factor it out in a test fixture?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -502,6 +514,160 @@ TEST_F(TestLocalFile, OpenWithMetadata) {
ASSERT_EQ(metadata.get(), reader2->metadata().get());
}
+// https://github.com/google/googletest/pull/2904 not available in our version
+// of gtest/gmock
+#define EXPECT_THROW_THAT(callable, ex_type, property) \
+ EXPECT_THROW( \
+ try { (callable)(); } catch (const ex_type& err) { \
+ EXPECT_THAT(err, (property)); \
+ throw; \
+ }, \
+ ex_type)
+
+TEST(TestDataPageV1Checksum, CorruptPage) {
+ // works when not checking crc.
+ {
+ auto reader = ParquetFileReader::OpenFile(data_page_v1_corrupt_checksum());
+ auto metadata_ptr = reader->metadata();
+ EXPECT_EQ(1U, metadata_ptr->num_row_groups());
+ auto rg = reader->RowGroup(0);
+    auto column0 =
+        std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(0));
+    auto column1 =
+        std::dynamic_pointer_cast<TypedColumnReader<Int32Type>>(rg->Column(1));
+    EXPECT_NE(nullptr, column0);
+    EXPECT_NE(nullptr, column1);
+ const int kPageSize = 1024 * 10;
+ const int kMembers = kPageSize * 2 / sizeof(int32_t);
Review Comment:
Why `* 2`? Because there are two pages per column?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]