tachyonwill commented on a change in pull request #11281:
URL: https://github.com/apache/arrow/pull/11281#discussion_r721740296



##########
File path: cpp/src/parquet/statistics.h
##########
@@ -282,8 +282,8 @@ class TypedStatistics : public Statistics {
 
   /// \brief Batch statistics update with supplied validity bitmap
   virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,

Review comment:
       Done.

##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -3896,6 +3896,41 @@ INSTANTIATE_TEST_SUITE_P(WriteDictionary, 
TestArrowWriteDictionary,
                          ::testing::Values(ParquetDataPageVersion::V1,
                                            ParquetDataPageVersion::V2));
 
+TEST(TestArrowWriteNestedDictionary, TestNullCount) {
+  std::shared_ptr<::arrow::Array> dictionary = ArrayFromJSON(::arrow::utf8(), 
R"(["z"])");
+  std::shared_ptr<::arrow::Array> indices =
+      ArrayFromJSON(::arrow::int32(), R"([0, null, 0])");
+  std::shared_ptr<::arrow::Array> offsets =
+      ArrayFromJSON(::arrow::int32(), R"([null, 0, null, null, null, 3])");
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<::arrow::Array> dict_encoded,
+                       ::arrow::DictionaryArray::FromArrays(indices, 
dictionary));
+  // values looks like [null, ["z", null, "z"], null, null, null]
+  ASSERT_OK_AND_ASSIGN(
+      std::shared_ptr<Array> values,
+      ::arrow::ListArray::FromArrays(*offsets, *dict_encoded, 
default_memory_pool()));

Review comment:
       Done.

##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -4191,5 +4226,107 @@ TEST(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
 }
 #endif
 
+struct StatisticsTestParam {
+  std::shared_ptr<::arrow::Table> table;
+  int expected_null_count;
+  // This is the non-null count and not the num_values in the page headers.
+  int expected_value_count;
+  std::string expected_min;
+  std::string expected_max;
+};
+
+class ParameterizedStatisticsTest : public 
::testing::TestWithParam<StatisticsTestParam> {
+};
+
+std::string GetManyEmptyLists() {
+  std::string many_empty_lists = "[";
+  for (int i = 0; i < 2000; ++i) {
+    many_empty_lists += "[],";
+  }
+  many_empty_lists += "[1,2,3,4,5,6,7,8,null]]";
+  return many_empty_lists;
+}
+
+TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
+  std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+  auto out_stream = 
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+  std::unique_ptr<FileWriter> writer;
+  ASSERT_OK(FileWriter::Open(*GetParam().table->schema(), 
default_memory_pool(),
+                             out_stream, default_writer_properties(),
+                             default_arrow_writer_properties(), &writer));
+  ASSERT_OK(writer->WriteTable(*GetParam().table, 
std::numeric_limits<int64_t>::max()));
+  ASSERT_OK(writer->Close());
+  if (!out_stream->closed()) {
+    ASSERT_OK(out_stream->Close());
+  }
+
+  auto buffer_reader = 
std::make_shared<::arrow::io::BufferReader>(serialized_data);
+  auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+  std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
+  std::shared_ptr<Statistics> stats = 
metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+  EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
+  EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
+  ASSERT_TRUE(stats->HasMinMax());
+  EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
+  EXPECT_EQ(stats->EncodeMax(), GetParam().expected_max);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    StatsTests, ParameterizedStatisticsTest,
+    ::testing::Values(
+        StatisticsTestParam{/*table=*/Table::Make(
+                                ::arrow::schema({::arrow::field("a", 
::arrow::utf8())}),
+                                {ArrayFromJSON(::arrow::utf8(), R"(["1", null, 
"3"])")}),
+                            /*expected=*/1, /* empty list counts as null as 
well */

Review comment:
       Done.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to