emkornfield commented on a change in pull request #10729:
URL: https://github.com/apache/arrow/pull/10729#discussion_r671403080



##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -3796,6 +3796,95 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {
   ASSERT_RAISES(Invalid, writer->WriteTable(*tbl, 1));
 }
 
+// ----------------------------------------------------------------------
+// Tests for directly writing DictionaryArray
+class TestArrowWriteDictionary : public ::testing::Test {
+ public:
+  void SetUp() override {
+    properties_ = default_arrow_writer_properties();
+    serialized_data_ = AllocateBuffer();
+  }
+
+  // Generates a range of single character strings from start to end 
(inclusive)
+  // Nulls will be inserted starting at index 0.  If there are any null
+  // values then start will not be the true min.  However, the dictionary
+  // array will always contain all the values even if they aren't all used.
+  void GenerateRange(int num_nulls, char start, char end) {
+    int num_values = end - start + 1;
+    DCHECK_LT(num_nulls, num_values);
+
+    ::arrow::StringBuilder dictionary_builder;
+    ::arrow::Int32Builder indices_builder;
+
+    for (char val = start; val <= end; val++) {
+      int32_t index = static_cast<int32_t>(val - start);
+      if (val - start >= num_nulls) {
+        ASSERT_OK(indices_builder.Append(index));
+      } else {
+        ASSERT_OK(indices_builder.AppendNull());
+      }
+      ASSERT_OK(dictionary_builder.Append(&val, 1));
+    }
+
+    std::shared_ptr<::arrow::Array> dictionary;
+    std::shared_ptr<::arrow::Array> indices;
+    ASSERT_OK(dictionary_builder.Finish(&dictionary));
+    ASSERT_OK(indices_builder.Finish(&indices));
+
+    ASSERT_OK_AND_ASSIGN(auto test_column,
+                         ::arrow::DictionaryArray::FromArrays(indices, 
dictionary));
+
+    auto schema = ::arrow::schema({::arrow::field("values", 
test_column->type())});
+    auto batch = ::arrow::RecordBatch::Make(schema, num_values, {test_column});
+    ASSERT_OK_AND_ASSIGN(test_data_, 
::arrow::Table::FromRecordBatches({batch}));
+  }
+
+  void WriteToBuffer() {
+    auto out_stream = 
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data_);
+    auto writer_properties = default_writer_properties();
+    std::unique_ptr<FileWriter> writer;
+    ASSERT_OK(FileWriter::Open(*test_data_->schema(), 
::arrow::default_memory_pool(),
+                               out_stream, writer_properties,
+                               default_arrow_writer_properties(), &writer));
+    ASSERT_OK(writer->WriteTable(*test_data_, 
std::numeric_limits<int64_t>::max()));
+    ASSERT_OK(writer->Close());
+  }
+
+  std::shared_ptr<FileMetaData> LoadWrittenMetadata() {
+    auto buffer_reader = 
std::make_shared<::arrow::io::BufferReader>(serialized_data_);
+    auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+    return parquet_reader->metadata();
+  }
+
+  void CheckNullCount() {
+    GenerateRange(5, 'a', 'z');
+    WriteToBuffer();
+    auto metadata = LoadWrittenMetadata();
+    auto stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+    ASSERT_TRUE(stats->HasNullCount());
+    ASSERT_EQ(stats->null_count(), 5);
+    ASSERT_EQ(stats->num_values(), 21);
+  }
+
+  void CheckMinMax() {
+    GenerateRange(5, 'a', 'z');
+    WriteToBuffer();
+    auto metadata = LoadWrittenMetadata();

Review comment:
       Please be generally careful with the use of auto.  The [style guide 
recommends against using it just to save 
space](https://google.github.io/styleguide/cppguide.html#Type_deduction)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to