Repository: parquet-cpp Updated Branches: refs/heads/master 2e0c28e5d -> 2a4fab5a2
PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables I will fix the underlying issue in Arrow, but this change works around it in the meantime so we can get a 1.3.1 release out soon. Author: Korn, Uwe <[email protected]> Closes #407 from xhochy/PARQUET-1121 and squashes the following commits: 85223b9 [Korn, Uwe] PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/2a4fab5a Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/2a4fab5a Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/2a4fab5a Branch: refs/heads/master Commit: 2a4fab5a2263b55a631c83aedea0c6b993b1b1c9 Parents: 2e0c28e Author: Korn, Uwe <[email protected]> Authored: Sat Oct 7 15:55:43 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Sat Oct 7 15:55:43 2017 -0400 ---------------------------------------------------------------------- src/parquet/arrow/arrow-reader-writer-test.cc | 28 ++++++++++++++++++++++ src/parquet/arrow/writer.cc | 6 +++++ 2 files changed, 34 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2a4fab5a/src/parquet/arrow/arrow-reader-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 4fd57ea..fc6410d 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -926,6 +926,34 @@ TEST_F(TestNullParquetIO, NullColumn) { internal::AssertArraysEqual(*values, *chunked_array->chunk(0)); } +TEST_F(TestNullParquetIO, NullDictionaryColumn) { + std::shared_ptr<Array> values = std::make_shared<::arrow::NullArray>(0); + std::shared_ptr<Array> indices = + std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE); + 
std::shared_ptr<::arrow::DictionaryType> dict_type = + std::make_shared<::arrow::DictionaryType>(::arrow::int8(), values); + std::shared_ptr<Array> dict_values = + std::make_shared<::arrow::DictionaryArray>(dict_type, indices); + std::shared_ptr<Table> table = MakeSimpleTable(dict_values, true); + this->sink_ = std::make_shared<InMemoryOutputStream>(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, + dict_values->length(), default_writer_properties())); + + std::shared_ptr<Table> out; + std::unique_ptr<FileReader> reader; + this->ReaderFromSink(&reader); + this->ReadTableFromFile(std::move(reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + + std::shared_ptr<Array> expected_values = + std::make_shared<::arrow::NullArray>(SMALL_SIZE); + AssertArraysEqual(*expected_values, *chunked_array->chunk(0)); +} + template <typename T> using ParquetCDataType = typename ParquetDataType<T>::c_type; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2a4fab5a/src/parquet/arrow/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index e834042..b53c1ca 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -819,6 +819,12 @@ Status FileWriter::Impl::WriteColumnChunk(const Array& data) { const ::arrow::DictionaryType& dict_type = static_cast<const ::arrow::DictionaryType&>(*data.type()); + // TODO(ARROW-1648): Remove this special handling once we require an Arrow + // version that has this fixed. + if (dict_type.dictionary()->type()->id() == ::arrow::Type::NA) { + return WriteColumnChunk(::arrow::NullArray(data.length())); + } + FunctionContext ctx(pool_); std::shared_ptr<Array> plain_array; RETURN_NOT_OK(
