kou commented on code in PR #13386:
URL: https://github.com/apache/arrow/pull/13386#discussion_r904239076
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_TRUE(std::memcmp(val.ptr, expected.c_str(), val.len) == 0);
Review Comment:
How about using string (string view) object for better error message?
```suggestion
ASSERT_EQ(::arrow::util::string_view(reinterpret_cast<const
char*>(val.ptr), val.len),
expected);
```
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_TRUE(std::memcmp(val.ptr, expected.c_str(), val.len) == 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
+ std::dynamic_pointer_cast<ByteArrayReader>(group->Column(0));
+
+ int16_t def_levels[25];
+ int16_t rep_levels[25];
+ ByteArray values[25];
+
+ // This file only has 1000 rows
+ ASSERT_EQ(1000, reader_->metadata()->num_rows());
+ // This file only has 1 row group
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 400 bytes
Review Comment:
```suggestion
// Size of the metadata is 105 bytes
```
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -57,6 +57,9 @@ std::string data_file(const char* file) {
return ss.str();
}
+std::string text_delta_len_byte_array() {
+ return data_file("delta_length_byte_array.parquet");
+}
Review Comment:
We don't need to define this because we just use this in only one place. We
can write this directly in `SetUp()`.
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_EQ(std::memcmp(val.ptr, expected.c_str(), val.len), 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
+ std::dynamic_pointer_cast<ByteArrayReader>(group->Column(0));
+
+ int16_t def_levels[25];
Review Comment:
Could you add `int64_t batch_size = 25;` and use it instead of putting many
`25` literals?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_TRUE(std::memcmp(val.ptr, expected.c_str(), val.len) == 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
+ std::dynamic_pointer_cast<ByteArrayReader>(group->Column(0));
+
+ int16_t def_levels[25];
+ int16_t rep_levels[25];
+ ByteArray values[25];
+
+ // This file only has 1000 rows
+ ASSERT_EQ(1000, reader_->metadata()->num_rows());
+ // This file only has 1 row group
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 400 bytes
+ ASSERT_EQ(105, reader_->metadata()->size());
+ // This row group must have 1000 rows
+ ASSERT_EQ(1000, group->metadata()->num_rows());
+
+ // Check if the column is encoded with DELTA_LENGTH_BYTE_ARRAY
+ std::unique_ptr<ColumnChunkMetaData> col_chunk =
group->metadata()->ColumnChunk(0);
Review Comment:
We can use `auto` here:
```suggestion
auto col_chunk = group->metadata()->ColumnChunk(0);
```
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_EQ(std::memcmp(val.ptr, expected.c_str(), val.len), 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
Review Comment:
Can we use `auto` here?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
Review Comment:
Can we use `auto` here?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_TRUE(std::memcmp(val.ptr, expected.c_str(), val.len) == 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
+ std::dynamic_pointer_cast<ByteArrayReader>(group->Column(0));
+
+ int16_t def_levels[25];
+ int16_t rep_levels[25];
+ ByteArray values[25];
+
+ // This file only has 1000 rows
+ ASSERT_EQ(1000, reader_->metadata()->num_rows());
+ // This file only has 1 row group
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 400 bytes
+ ASSERT_EQ(105, reader_->metadata()->size());
+ // This row group must have 1000 rows
+ ASSERT_EQ(1000, group->metadata()->num_rows());
+
+ // Check if the column is encoded with DELTA_LENGTH_BYTE_ARRAY
+ std::unique_ptr<ColumnChunkMetaData> col_chunk =
group->metadata()->ColumnChunk(0);
+
+ ASSERT_TRUE(std::find(col_chunk->encodings().begin(),
col_chunk->encodings().end(),
+ Encoding::DELTA_LENGTH_BYTE_ARRAY) !=
+ col_chunk->encodings().end());
+
+ ASSERT_TRUE(col->HasNext());
+ int64_t values_read = 0;
+ int64_t curr_batch_read;
+ std::string expected_prefix("apple_banana_mango");
+ while (values_read < 1000) {
+ auto levels_read =
+ col->ReadBatch(25, def_levels, rep_levels, values, &curr_batch_read);
+ ASSERT_EQ(25, levels_read);
+ ASSERT_EQ(25, curr_batch_read);
+ for (int i = 0; i < 25; i++) {
+ std::string expected =
Review Comment:
We can use `auto` here.
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_EQ(std::memcmp(val.ptr, expected.c_str(), val.len), 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
Review Comment:
Can we use `auto` here?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +130,85 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestTextDeltaLengthByteArray : public ::testing::Test {
+ public:
+ void SetUp() { reader_ =
ParquetFileReader::OpenFile(text_delta_len_byte_array()); }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestTextDeltaLengthByteArray, TestTextScanner) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayScanner> scanner(new
ByteArrayScanner(group->Column(0)));
+ ByteArray val;
+ bool is_null;
+ std::string expected_prefix("apple_banana_mango");
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+ ASSERT_FALSE(is_null);
+ std::string expected = expected_prefix + std::to_string(i * i);
+ ASSERT_TRUE(val.len == expected.length());
+ ASSERT_TRUE(std::memcmp(val.ptr, expected.c_str(), val.len) == 0);
+ }
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestTextDeltaLengthByteArray, TestBatchRead) {
+ std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
+
+ // column 0, id
+ std::shared_ptr<ByteArrayReader> col =
+ std::dynamic_pointer_cast<ByteArrayReader>(group->Column(0));
+
+ int16_t def_levels[25];
+ int16_t rep_levels[25];
+ ByteArray values[25];
+
+ // This file only has 1000 rows
+ ASSERT_EQ(1000, reader_->metadata()->num_rows());
+ // This file only has 1 row group
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 400 bytes
+ ASSERT_EQ(105, reader_->metadata()->size());
+ // This row group must have 1000 rows
+ ASSERT_EQ(1000, group->metadata()->num_rows());
+
+ // Check if the column is encoded with DELTA_LENGTH_BYTE_ARRAY
+ std::unique_ptr<ColumnChunkMetaData> col_chunk =
group->metadata()->ColumnChunk(0);
+
+ ASSERT_TRUE(std::find(col_chunk->encodings().begin(),
col_chunk->encodings().end(),
+ Encoding::DELTA_LENGTH_BYTE_ARRAY) !=
+ col_chunk->encodings().end());
+
+ ASSERT_TRUE(col->HasNext());
+ int64_t values_read = 0;
+ int64_t curr_batch_read;
+ std::string expected_prefix("apple_banana_mango");
+ while (values_read < 1000) {
+ auto levels_read =
+ col->ReadBatch(25, def_levels, rep_levels, values, &curr_batch_read);
+ ASSERT_EQ(25, levels_read);
+ ASSERT_EQ(25, curr_batch_read);
+ for (int i = 0; i < 25; i++) {
+ std::string expected =
+ expected_prefix + std::to_string((i + values_read) * (i +
values_read));
+ ASSERT_TRUE(values[i].len == expected.length());
+ ASSERT_TRUE(std::memcmp(values[i].ptr, expected.c_str(), values[i].len)
== 0);
Review Comment:
How about using string view here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]