emkornfield commented on a change in pull request #10729:
URL: https://github.com/apache/arrow/pull/10729#discussion_r671601430



##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -3796,6 +3796,95 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {
   ASSERT_RAISES(Invalid, writer->WriteTable(*tbl, 1));
 }
 
+// ----------------------------------------------------------------------
+// Tests for directly writing DictionaryArray
+class TestArrowWriteDictionary : public ::testing::Test {
+ public:
+  void SetUp() override {
+    properties_ = default_arrow_writer_properties();
+    serialized_data_ = AllocateBuffer();
+  }
+
+  // Generates a range of single character strings from start to end 
(inclusive)
+  // Nulls will be inserted starting at index 0.  If there are any null
+  // values then start will not be the true min.  However, the dictionary
+  // array will always contain all the values even if they aren't all used.
+  void GenerateRange(int num_nulls, char start, char end) {
+    int num_values = end - start + 1;
+    DCHECK_LT(num_nulls, num_values);
+
+    ::arrow::StringBuilder dictionary_builder;
+    ::arrow::Int32Builder indices_builder;
+
+    for (char val = start; val <= end; val++) {
+      int32_t index = static_cast<int32_t>(val - start);
+      if (val - start >= num_nulls) {
+        ASSERT_OK(indices_builder.Append(index));
+      } else {
+        ASSERT_OK(indices_builder.AppendNull());
+      }
+      ASSERT_OK(dictionary_builder.Append(&val, 1));
+    }
+
+    std::shared_ptr<::arrow::Array> dictionary;
+    std::shared_ptr<::arrow::Array> indices;
+    ASSERT_OK(dictionary_builder.Finish(&dictionary));
+    ASSERT_OK(indices_builder.Finish(&indices));
+
+    ASSERT_OK_AND_ASSIGN(auto test_column,
+                         ::arrow::DictionaryArray::FromArrays(indices, 
dictionary));
+
+    auto schema = ::arrow::schema({::arrow::field("values", 
test_column->type())});
+    auto batch = ::arrow::RecordBatch::Make(schema, num_values, {test_column});
+    ASSERT_OK_AND_ASSIGN(test_data_, 
::arrow::Table::FromRecordBatches({batch}));
+  }
+
+  void WriteToBuffer() {
+    auto out_stream = 
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data_);
+    auto writer_properties = default_writer_properties();
+    std::unique_ptr<FileWriter> writer;
+    ASSERT_OK(FileWriter::Open(*test_data_->schema(), 
::arrow::default_memory_pool(),
+                               out_stream, writer_properties,
+                               default_arrow_writer_properties(), &writer));
+    ASSERT_OK(writer->WriteTable(*test_data_, 
std::numeric_limits<int64_t>::max()));
+    ASSERT_OK(writer->Close());
+  }
+
+  std::shared_ptr<FileMetaData> LoadWrittenMetadata() {
+    auto buffer_reader = 
std::make_shared<::arrow::io::BufferReader>(serialized_data_);
+    auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+    return parquet_reader->metadata();
+  }
+
+  void CheckNullCount() {
+    GenerateRange(5, 'a', 'z');
+    WriteToBuffer();
+    auto metadata = LoadWrittenMetadata();
+    auto stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+    ASSERT_TRUE(stats->HasNullCount());
+    ASSERT_EQ(stats->null_count(), 5);
+    ASSERT_EQ(stats->num_values(), 21);
+  }
+
+  void CheckMinMax() {
+    GenerateRange(5, 'a', 'z');
+    WriteToBuffer();
+    auto metadata = LoadWrittenMetadata();

Review comment:
       I think I might be the only person who calls people on it; I think most 
active developers will use auto a lot.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to