[ 
https://issues.apache.org/jira/browse/ARROW-2411?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16432605#comment-16432605
 ] 

ASF GitHub Bot commented on ARROW-2411:
---------------------------------------

pitrou closed pull request #1852: ARROW-2411: [C++] Add 
StringBuilder::Append(const char **values)
URL: https://github.com/apache/arrow/pull/1852
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index fb1bebfca..2ae9f48db 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1022,6 +1022,72 @@ TEST_F(TestStringBuilder, TestAppendVector) {
   }
 }
 
+TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) {
+  const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""};
+  vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
+
+  int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
+  int reps = 1000;
+
+  for (int j = 0; j < reps; ++j) {
+    ASSERT_OK(builder_->Append(strings, N, valid_bytes.data()));
+  }
+  Done();
+
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(reps * 3, result_->null_count());
+  ASSERT_EQ(reps * 3, result_->value_data()->size());
+
+  int32_t length;
+  int32_t pos = 0;
+  for (int i = 0; i < N * reps; ++i) {
+    auto string = strings[i % N];
+    if (string && valid_bytes[i % N]) {
+      ASSERT_FALSE(result_->IsNull(i));
+      result_->GetValue(i, &length);
+      ASSERT_EQ(pos, result_->value_offset(i));
+      ASSERT_EQ(static_cast<int32_t>(strlen(string)), length);
+      ASSERT_EQ(strings[i % N], result_->GetString(i));
+
+      pos += length;
+    } else {
+      ASSERT_TRUE(result_->IsNull(i));
+    }
+  }
+}
+
+TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) {
+  const char* strings[] = {"", "bb", "a", nullptr, "ccc"};
+
+  int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
+  int reps = 1000;
+
+  for (int j = 0; j < reps; ++j) {
+    ASSERT_OK(builder_->Append(strings, N));
+  }
+  Done();
+
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(reps, result_->null_count());
+  ASSERT_EQ(reps * 6, result_->value_data()->size());
+
+  int32_t length;
+  int32_t pos = 0;
+  for (int i = 0; i < N * reps; ++i) {
+    if (strings[i % N]) {
+      ASSERT_FALSE(result_->IsNull(i));
+      result_->GetValue(i, &length);
+      ASSERT_EQ(pos, result_->value_offset(i));
+      ASSERT_EQ(static_cast<int32_t>(strlen(strings[i % N])), length);
+      ASSERT_EQ(strings[i % N], result_->GetString(i));
+
+      pos += length;
+    } else {
+      ASSERT_TRUE(result_->IsNull(i));
+    }
+  }
+}
+
 TEST_F(TestStringBuilder, TestZeroLength) {
   // All buffers are null
   Done();
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index c97253e64..78c42f4fc 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1413,6 +1413,64 @@ Status StringBuilder::Append(const 
std::vector<std::string>& values,
   return Status::OK();
 }
 
+Status StringBuilder::Append(const char** values, int64_t length,
+                             const uint8_t* valid_bytes) {
+  std::size_t total_length = 0;
+  std::vector<std::size_t> value_lengths(length);
+  bool have_null_value = false;
+  for (int64_t i = 0; i < length; ++i) {
+    if (values[i]) {
+      auto value_length = strlen(values[i]);
+      value_lengths[i] = value_length;
+      total_length += value_length;
+    } else {
+      have_null_value = true;
+    }
+  }
+  RETURN_NOT_OK(Reserve(length));
+  RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
+  RETURN_NOT_OK(offsets_builder_.Reserve(length));
+
+  if (valid_bytes) {
+    int64_t valid_bytes_offset = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      RETURN_NOT_OK(AppendNextOffset());
+      if (valid_bytes[i]) {
+        if (values[i]) {
+          RETURN_NOT_OK(value_data_builder_.Append(
+              reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+        } else {
+          UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - 
valid_bytes_offset);
+          UnsafeAppendToBitmap(false);
+          valid_bytes_offset = i + 1;
+        }
+      }
+    }
+    UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - 
valid_bytes_offset);
+  } else {
+    if (have_null_value) {
+      std::vector<uint8_t> valid_vector(length, 0);
+      for (int64_t i = 0; i < length; ++i) {
+        RETURN_NOT_OK(AppendNextOffset());
+        if (values[i]) {
+          RETURN_NOT_OK(value_data_builder_.Append(
+              reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+          valid_vector[i] = 1;
+        }
+      }
+      UnsafeAppendToBitmap(valid_vector.data(), length);
+    } else {
+      for (int64_t i = 0; i < length; ++i) {
+        RETURN_NOT_OK(AppendNextOffset());
+        RETURN_NOT_OK(value_data_builder_.Append(
+            reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+      }
+      UnsafeAppendToBitmap(nullptr, length);
+    }
+  }
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // Fixed width binary
 
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index b0f77bd98..54ce1dfec 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -720,6 +720,18 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 
   Status Append(const std::vector<std::string>& values,
                 const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a sequence of nul-terminated strings in one shot.
+  ///        If one of the values is NULL, it is processed as a null
+  ///        value even if the corresponding valid_bytes entry is 1.
+  ///
+  /// \param[in] values a contiguous C array of nul-terminated char *
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status Append(const char** values, int64_t length,
+                const uint8_t* valid_bytes = NULLPTR);
 };
 
 // ----------------------------------------------------------------------


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Add method to append batches of null-terminated strings to StringBuilder
> ------------------------------------------------------------------------------
>
>                 Key: ARROW-2411
>                 URL: https://issues.apache.org/jira/browse/ARROW-2411
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++, GLib
>            Reporter: Uwe L. Korn
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.10.0
>
>
> We should add a method {{StringBuilder::AppendCStrings(const char** values, 
> const uint8_t* valid_bytes = NULLPTR)}} to the {{StringBuilder}} class to 
> have fast inserts for these strings. See 
> https://github.com/apache/arrow/pull/1845/files for a use case.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to