Repository: parquet-cpp Updated Branches: refs/heads/master 82515fead -> 9a0407e68
PARQUET-759: Fix handling of columns of empty strings Depends on the changes in https://github.com/apache/arrow/pull/189 Author: Uwe L. Korn <[email protected]> Closes #181 from xhochy/PARQUET-759 and squashes the following commits: 94b7054 [Uwe L. Korn] Increase Arrow hash accd787 [Uwe L. Korn] PARQUET-759: Fix handling of columns of empty strings Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/9a0407e6 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/9a0407e6 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/9a0407e6 Branch: refs/heads/master Commit: 9a0407e684c0a6299d0e6ab98c11c1162915c0ee Parents: 82515fe Author: Uwe L. Korn <[email protected]> Authored: Mon Oct 31 21:17:53 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Mon Oct 31 21:17:53 2016 -0400 ---------------------------------------------------------------------- src/parquet/arrow/arrow-reader-writer-test.cc | 25 ++++++++++++++++++++++ src/parquet/arrow/writer.cc | 10 +++++++-- thirdparty/versions.sh | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/src/parquet/arrow/arrow-reader-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 1f28e5c..5ec70f3 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -428,6 +428,31 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) { this->ReadAndCheckSingleColumnTable(expected_values); } +using TestStringParquetIO = TestParquetIO<::arrow::StringType>; + +TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) { + std::shared_ptr<Array> values; + ::arrow::StringBuilder builder( + ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>()); + for (size_t i = 0; i < SMALL_SIZE; i++) { + builder.Append(""); + } + ASSERT_OK(builder.Finish(&values)); + std::shared_ptr<Table> table = MakeSimpleTable(values, false); + this->sink_ = std::make_shared<InMemoryOutputStream>(); + ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), ::arrow::default_memory_pool(), + this->sink_, values->length(), default_writer_properties())); + + std::shared_ptr<Table> out; + this->ReadTableFromFile(this->ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + template <typename T> using ParquetCDataType = typename ParquetDataType<T>::c_type; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/src/parquet/arrow/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index e75d4b7..e4d3745 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -262,8 +262,14 @@ Status FileWriter::Impl::WriteFlatColumnChunk( DCHECK((offset + length) <= data->length()); RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(ByteArray))); auto buffer_ptr = reinterpret_cast<ByteArray*>(data_buffer_.mutable_data()); - auto data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data()); - DCHECK(data_ptr != nullptr); + // In the case of an array consisting of only empty strings or all null, + // data->data() points already to a nullptr, thus data->data()->data() will + // segfault. + const uint8_t* data_ptr = nullptr; + if (data->data()) { + data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data()); + DCHECK(data_ptr != nullptr); + } auto writer = reinterpret_cast<TypedColumnWriter<ByteArrayType>*>(column_writer); if (writer->descr()->max_definition_level() > 0) { RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/thirdparty/versions.sh ---------------------------------------------------------------------- diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh index 87fe6b6..855b6f7 100755 --- a/thirdparty/versions.sh +++ b/thirdparty/versions.sh @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -ARROW_VERSION="676c32ccea6274c75b2750453c1ddbc5f645c037" +ARROW_VERSION="d946e7917d55cb220becd6469ae93430f2e60764" ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz" ARROW_BASEDIR="arrow-${ARROW_VERSION}"
