This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 91ec792 ARROW-2411: [C++] Add StringBuilder::Append(const char **values) 91ec792 is described below commit 91ec7928fc112883d4c693059bc7371789a7fe0d Author: Kouhei Sutou <k...@clear-code.com> AuthorDate: Tue Apr 10 19:14:38 2018 +0200 ARROW-2411: [C++] Add StringBuilder::Append(const char **values) This implementation uses not NULL-terminated C strings and the length of values instead of NULL-terminated C strings. Because other builder uses the interface such as `PrimitiveBuilder::Append()`. Author: Kouhei Sutou <k...@clear-code.com> Author: Antoine Pitrou <pit...@free.fr> Closes #1852 from kou/cpp-string-builder-append-cstrings and squashes the following commits: df00afc <Antoine Pitrou> Nit: fix docstring wording ed8203a <Kouhei Sutou> Process NULL char * as a null value 6d4669b <Kouhei Sutou> Fix expected value d697e49 <Kouhei Sutou> Format 001ea51 <Kouhei Sutou> Process nullptr char * as a NULL value 0a11f7b <Kouhei Sutou> Use "nul-terminated char *" instead of "C string" f06fd76 <Kouhei Sutou> Add StringBuilder::Append(const char **values) --- cpp/src/arrow/array-test.cc | 66 +++++++++++++++++++++++++++++++++++++++++++++ cpp/src/arrow/builder.cc | 58 +++++++++++++++++++++++++++++++++++++++ cpp/src/arrow/builder.h | 12 +++++++++ 3 files changed, 136 insertions(+) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index fb1bebf..2ae9f48 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1022,6 +1022,72 @@ TEST_F(TestStringBuilder, TestAppendVector) { } } +TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) { + const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""}; + vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; + + int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + ASSERT_OK(builder_->Append(strings, N, valid_bytes.data())); + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * 3, result_->null_count()); + ASSERT_EQ(reps * 3, result_->value_data()->size()); + + int32_t length; + int32_t pos = 0; + for (int i = 0; i < N * reps; ++i) { + auto string = strings[i % N]; + if (string && valid_bytes[i % N]) { + ASSERT_FALSE(result_->IsNull(i)); + result_->GetValue(i, &length); + ASSERT_EQ(pos, result_->value_offset(i)); + ASSERT_EQ(static_cast<int32_t>(strlen(string)), length); + ASSERT_EQ(strings[i % N], result_->GetString(i)); + + pos += length; + } else { + ASSERT_TRUE(result_->IsNull(i)); + } + } +} + +TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { + const char* strings[] = {"", "bb", "a", nullptr, "ccc"}; + + int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + ASSERT_OK(builder_->Append(strings, N)); + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->value_data()->size()); + + int32_t length; + int32_t pos = 0; + for (int i = 0; i < N * reps; ++i) { + if (strings[i % N]) { + ASSERT_FALSE(result_->IsNull(i)); + result_->GetValue(i, &length); + ASSERT_EQ(pos, result_->value_offset(i)); + ASSERT_EQ(static_cast<int32_t>(strlen(strings[i % N])), length); + ASSERT_EQ(strings[i % N], result_->GetString(i)); + + pos += length; + } else { + ASSERT_TRUE(result_->IsNull(i)); + } + } +} + TEST_F(TestStringBuilder, TestZeroLength) { // All buffers are null Done(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c97253e..78c42f4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1413,6 +1413,64 @@ Status StringBuilder::Append(const std::vector<std::string>& values, return Status::OK(); } +Status StringBuilder::Append(const char** values, int64_t length, + const uint8_t* valid_bytes) { + std::size_t total_length = 0; + std::vector<std::size_t> value_lengths(length); + bool have_null_value = false; + for (int64_t i = 0; i < length; ++i) { + if (values[i]) { + auto value_length = strlen(values[i]); + value_lengths[i] = value_length; + total_length += value_length; + } else { + have_null_value = true; + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); + RETURN_NOT_OK(offsets_builder_.Reserve(length)); + + if (valid_bytes) { + int64_t valid_bytes_offset = 0; + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + if (valid_bytes[i]) { + if (values[i]) { + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i])); + } else { + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); + UnsafeAppendToBitmap(false); + valid_bytes_offset = i + 1; + } + } + } + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); + } else { + if (have_null_value) { + std::vector<uint8_t> valid_vector(length, 0); + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + if (values[i]) { + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i])); + valid_vector[i] = 1; + } + } + UnsafeAppendToBitmap(valid_vector.data(), length); + } else { + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i])); + } + UnsafeAppendToBitmap(nullptr, length); + } + } + return Status::OK(); +} + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index b0f77bd..54ce1df 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -720,6 +720,18 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { Status Append(const std::vector<std::string>& values, const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. + /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status Append(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); }; // ---------------------------------------------------------------------- -- To stop receiving notification emails like this one, please contact apit...@apache.org.