pitrou commented on a change in pull request #11281:
URL: https://github.com/apache/arrow/pull/11281#discussion_r721424568
##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -3896,6 +3896,41 @@ INSTANTIATE_TEST_SUITE_P(WriteDictionary,
TestArrowWriteDictionary,
::testing::Values(ParquetDataPageVersion::V1,
ParquetDataPageVersion::V2));
+TEST(TestArrowWriteNestedDictionary, TestNullCount) {
+ std::shared_ptr<::arrow::Array> dictionary = ArrayFromJSON(::arrow::utf8(),
R"(["z"])");
+ std::shared_ptr<::arrow::Array> indices =
+ ArrayFromJSON(::arrow::int32(), R"([0, null, 0])");
+ std::shared_ptr<::arrow::Array> offsets =
+ ArrayFromJSON(::arrow::int32(), R"([null, 0, null, null, null, 3])");
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<::arrow::Array> dict_encoded,
+ ::arrow::DictionaryArray::FromArrays(indices,
dictionary));
+ // values looks like [null, ["z", null, "z"], null, null, null]
+ ASSERT_OK_AND_ASSIGN(
+ std::shared_ptr<Array> values,
+ ::arrow::ListArray::FromArrays(*offsets, *dict_encoded,
default_memory_pool()));
+ std::shared_ptr<::arrow::Schema> schema =
+ ::arrow::schema({::arrow::field("values", values->type())});
+ std::shared_ptr<::arrow::Table> table = ::arrow::Table::Make(schema,
{values});
+
+ std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+ auto out_stream =
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+ std::unique_ptr<FileWriter> writer;
+ ASSERT_OK(FileWriter::Open(*table->schema(), default_memory_pool(),
out_stream,
+ default_writer_properties(),
+ default_arrow_writer_properties(), &writer));
+ ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
+ ASSERT_OK(writer->Close());
+ if (!out_stream->closed()) {
Review comment:
It is probably not necessary to test for `out_stream->closed()`, is it?
##########
File path: cpp/src/parquet/statistics.h
##########
@@ -282,8 +282,8 @@ class TypedStatistics : public Statistics {
/// \brief Batch statistics update with supplied validity bitmap
virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
Review comment:
Can you add a description of the parameters to the docstring? It is not
immediately obvious what `num_spaced_values` is...
##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -4191,5 +4226,107 @@ TEST(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
}
#endif
+struct StatisticsTestParam {
+ std::shared_ptr<::arrow::Table> table;
+ int expected_null_count;
+ // This is the non-null count and not the num_values in the page headers.
+ int expected_value_count;
+ std::string expected_min;
+ std::string expected_max;
+};
+
+class ParameterizedStatisticsTest : public
::testing::TestWithParam<StatisticsTestParam> {
+};
+
+std::string GetManyEmptyLists() {
+ std::string many_empty_lists = "[";
+ for (int i = 0; i < 2000; ++i) {
+ many_empty_lists += "[],";
+ }
+ many_empty_lists += "[1,2,3,4,5,6,7,8,null]]";
+ return many_empty_lists;
+}
+
+TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
+ std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+ auto out_stream =
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+ std::unique_ptr<FileWriter> writer;
+ ASSERT_OK(FileWriter::Open(*GetParam().table->schema(),
default_memory_pool(),
+ out_stream, default_writer_properties(),
+ default_arrow_writer_properties(), &writer));
+ ASSERT_OK(writer->WriteTable(*GetParam().table,
std::numeric_limits<int64_t>::max()));
+ ASSERT_OK(writer->Close());
+ if (!out_stream->closed()) {
+ ASSERT_OK(out_stream->Close());
+ }
+
+ auto buffer_reader =
std::make_shared<::arrow::io::BufferReader>(serialized_data);
+ auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+ std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
+ std::shared_ptr<Statistics> stats =
metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+ EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
+ EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
+ ASSERT_TRUE(stats->HasMinMax());
+ EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
+ EXPECT_EQ(stats->EncodeMax(), GetParam().expected_max);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ StatsTests, ParameterizedStatisticsTest,
+ ::testing::Values(
+ StatisticsTestParam{/*table=*/Table::Make(
+ ::arrow::schema({::arrow::field("a",
::arrow::utf8())}),
+ {ArrayFromJSON(::arrow::utf8(), R"(["1", null,
"3"])")}),
+ /*expected=*/1, /* empty list counts as null as
well */
Review comment:
Should be `/*expected_null_count=*/`
##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -3896,6 +3896,41 @@ INSTANTIATE_TEST_SUITE_P(WriteDictionary,
TestArrowWriteDictionary,
::testing::Values(ParquetDataPageVersion::V1,
ParquetDataPageVersion::V2));
+TEST(TestArrowWriteNestedDictionary, TestNullCount) {
+ std::shared_ptr<::arrow::Array> dictionary = ArrayFromJSON(::arrow::utf8(),
R"(["z"])");
+ std::shared_ptr<::arrow::Array> indices =
+ ArrayFromJSON(::arrow::int32(), R"([0, null, 0])");
+ std::shared_ptr<::arrow::Array> offsets =
+ ArrayFromJSON(::arrow::int32(), R"([null, 0, null, null, null, 3])");
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<::arrow::Array> dict_encoded,
+ ::arrow::DictionaryArray::FromArrays(indices,
dictionary));
+ // values looks like [null, ["z", null, "z"], null, null, null]
+ ASSERT_OK_AND_ASSIGN(
+ std::shared_ptr<Array> values,
+ ::arrow::ListArray::FromArrays(*offsets, *dict_encoded,
default_memory_pool()));
Review comment:
You can directly create a list-of-dictionary array using
`ArrayFromJSON`. For example:
```c++
auto type = list(dictionary(int32(), utf8()));
auto values = ArrayFromJSON(type, R"([null, ["z", null, "z"], null, null,
null])");
```
##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -4191,5 +4226,107 @@ TEST(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
}
#endif
+struct StatisticsTestParam {
Review comment:
Can you perhaps create a separate `statistics_test.cc`? This test file
is getting much too long already.
##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -4191,5 +4226,107 @@ TEST(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
}
#endif
+struct StatisticsTestParam {
+ std::shared_ptr<::arrow::Table> table;
+ int expected_null_count;
+ // This is the non-null count and not the num_values in the page headers.
+ int expected_value_count;
+ std::string expected_min;
+ std::string expected_max;
+};
+
+class ParameterizedStatisticsTest : public
::testing::TestWithParam<StatisticsTestParam> {
+};
+
+std::string GetManyEmptyLists() {
+ std::string many_empty_lists = "[";
+ for (int i = 0; i < 2000; ++i) {
+ many_empty_lists += "[],";
+ }
+ many_empty_lists += "[1,2,3,4,5,6,7,8,null]]";
+ return many_empty_lists;
+}
+
+TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
+ std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
Review comment:
Would be nice to add a comment with a reference to the JIRA ticket.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]