This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8149c39027 GH-39560: [C++][Parquet] Add integration test for
BYTE_STREAM_SPLIT (#39570)
8149c39027 is described below
commit 8149c390276c2f4d4e0031cd162b4498825f9062
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Jan 11 19:40:45 2024 +0100
GH-39560: [C++][Parquet] Add integration test for BYTE_STREAM_SPLIT (#39570)
### Rationale for this change
In https://github.com/apache/parquet-testing/pull/45 , an integration file
for BYTE_STREAM_SPLIT was added to the parquet-testing repo.
### What changes are included in this PR?
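Add a test (`TestByteStreamSplit.FloatIntegrationFile`, compiled only when
ARROW_WITH_ZSTD is defined) that reads that file and checks the first two and
last two decoded values of its `f32` and `f64` columns. The parquet-testing
submodule is bumped to pick up the new file.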
### Are these changes tested?
Yes, by definition: the change itself is a test.
### Are there any user-facing changes?
No.
* Closes: #39560
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/reader_test.cc | 53 +++++++++++++++++++++++++++++++++++++++---
cpp/submodules/parquet-testing | 2 +-
2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 2c2b62f5d1..551f62798e 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -120,11 +120,27 @@ std::string concatenated_gzip_members() {
return data_file("concatenated_gzip_members.parquet");
}
+std::string byte_stream_split() { return
data_file("byte_stream_split.zstd.parquet"); }
+
+template <typename DType, typename ValueType = typename DType::c_type>
+std::vector<ValueType> ReadColumnValues(ParquetFileReader* file_reader, int
row_group,
+ int column, int64_t
expected_values_read) {
+ auto column_reader = checked_pointer_cast<TypedColumnReader<DType>>(
+ file_reader->RowGroup(row_group)->Column(column));
+ std::vector<ValueType> values(expected_values_read);
+ int64_t values_read;
+ auto levels_read = column_reader->ReadBatch(expected_values_read, nullptr,
nullptr,
+ values.data(), &values_read);
+ EXPECT_EQ(expected_values_read, levels_read);
+ EXPECT_EQ(expected_values_read, values_read);
+ return values;
+}
+
// TODO: Assert on definition and repetition levels
-template <typename DType, typename ValueType>
+template <typename DType, typename ValueType = typename DType::c_type>
void AssertColumnValues(std::shared_ptr<TypedColumnReader<DType>> col, int64_t
batch_size,
int64_t expected_levels_read,
- std::vector<ValueType>& expected_values,
+ const std::vector<ValueType>& expected_values,
int64_t expected_values_read) {
std::vector<ValueType> values(batch_size);
int64_t values_read;
@@ -1412,7 +1428,6 @@ TEST_P(TestCodec, LargeFileValues) {
// column 0 ("a")
auto col = checked_pointer_cast<ByteArrayReader>(group->Column(0));
-
std::vector<ByteArray> values(kNumRows);
int64_t values_read;
auto levels_read =
@@ -1474,6 +1489,38 @@ TEST(TestFileReader, TestOverflowInt16PageOrdinal) {
}
}
+#ifdef ARROW_WITH_ZSTD
+TEST(TestByteStreamSplit, FloatIntegrationFile) {
+ auto file_path = byte_stream_split();
+ auto file = ParquetFileReader::OpenFile(file_path);
+
+ const int64_t kNumRows = 300;
+
+ ASSERT_EQ(kNumRows, file->metadata()->num_rows());
+ ASSERT_EQ(2, file->metadata()->num_columns());
+ ASSERT_EQ(1, file->metadata()->num_row_groups());
+
+ // column 0 ("f32")
+ {
+ auto values =
+ ReadColumnValues<FloatType>(file.get(), /*row_group=*/0, /*column=*/0,
kNumRows);
+ ASSERT_EQ(values[0], 1.7640524f);
+ ASSERT_EQ(values[1], 0.4001572f);
+ ASSERT_EQ(values[kNumRows - 2], -0.39944902f);
+ ASSERT_EQ(values[kNumRows - 1], 0.37005588f);
+ }
+ // column 1 ("f64")
+ {
+ auto values =
+ ReadColumnValues<DoubleType>(file.get(), /*row_group=*/0,
/*column=*/1, kNumRows);
+ ASSERT_EQ(values[0], -1.3065268517353166);
+ ASSERT_EQ(values[1], 1.658130679618188);
+ ASSERT_EQ(values[kNumRows - 2], -0.9301565025243212);
+ ASSERT_EQ(values[kNumRows - 1], -0.17858909208732915);
+ }
+}
+#endif // ARROW_WITH_ZSTD
+
struct PageIndexReaderParam {
std::vector<int32_t> row_group_indices;
std::vector<int32_t> column_indices;
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index d69d979223..4cb3cff24c 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit d69d979223e883faef9dc6fe3cf573087243c28a
+Subproject commit 4cb3cff24c965fb329cdae763eabce47395a68a0