[ 
https://issues.apache.org/jira/browse/ARROW-2457?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16446808#comment-16446808
 ] 

ASF GitHub Bot commented on ARROW-2457:
---------------------------------------

xhochy closed pull request #1921: ARROW-2457: [GLib] Support large is_valids in 
builder's append_values()
URL: https://github.com/apache/arrow/pull/1921
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/c_glib/arrow-glib/array-builder.cpp 
b/c_glib/arrow-glib/array-builder.cpp
index f129998815..343615bd90 100644
--- a/c_glib/arrow-glib/array-builder.cpp
+++ b/c_glib/arrow-glib/array-builder.cpp
@@ -42,7 +42,7 @@ garrow_array_builder_append(GArrowArrayBuilder *builder,
 template <typename BUILDER, typename VALUE>
 gboolean
 garrow_array_builder_append_values(GArrowArrayBuilder *builder,
-                                   const VALUE *values,
+                                   VALUE *values,
                                    gint64 values_length,
                                    const gboolean *is_valids,
                                    gint64 is_valids_length,
@@ -65,13 +65,37 @@ garrow_array_builder_append_values(GArrowArrayBuilder 
*builder,
                   is_valids_length);
       return FALSE;
     }
-    uint8_t valid_bytes[is_valids_length];
-    for (gint64 i = 0; i < is_valids_length; ++i) {
-      valid_bytes[i] = is_valids[i];
+
+    const gint64 chunk_size = 4096;
+    gint64 n_chunks = is_valids_length / chunk_size;
+    gint64 n_remains = is_valids_length % chunk_size;
+    for (gint64 i = 0; i < n_chunks; ++i) {
+      uint8_t valid_bytes[chunk_size];
+      gint64 offset = chunk_size * i;
+      const gboolean *chunked_is_valids = is_valids + offset;
+      for (gint64 j = 0; j < chunk_size; ++j) {
+        valid_bytes[j] = chunked_is_valids[j];
+      }
+      status = arrow_builder->AppendValues(values + offset,
+                                           chunk_size,
+                                           valid_bytes);
+      if (!garrow_error_check(error, status, context)) {
+        return FALSE;
+      }
+    }
+    {
+      uint8_t valid_bytes[n_remains];
+      gint64 offset = chunk_size * n_chunks;
+      const gboolean *chunked_is_valids = is_valids + offset;
+      for (gint64 i = 0; i < n_remains; ++i) {
+        valid_bytes[i] = chunked_is_valids[i];
+      }
+      status = arrow_builder->AppendValues(values + offset,
+                                           n_remains,
+                                           valid_bytes);
     }
-    status = arrow_builder->Append(values, values_length, valid_bytes);
   } else {
-    status = arrow_builder->Append(values, values_length, nullptr);
+    status = arrow_builder->AppendValues(values, values_length, nullptr);
   }
   return garrow_error_check(error, status, context);
 }
@@ -2212,37 +2236,14 @@ 
garrow_string_array_builder_append_values(GArrowStringArrayBuilder *builder,
                                           gint64 is_valids_length,
                                           GError **error)
 {
-  const char *context = "[string-array-builder][append-values]";
-  auto arrow_builder =
-    static_cast<arrow::StringBuilder *>(
-      garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder)));
-
-  if (is_valids_length > 0) {
-    if (values_length != is_valids_length) {
-      g_set_error(error,
-                  GARROW_ERROR,
-                  GARROW_ERROR_INVALID,
-                  "%s: values length and is_valids length must be equal: "
-                  "<%" G_GINT64_FORMAT "> != "
-                  "<%" G_GINT64_FORMAT ">",
-                  context,
-                  values_length,
-                  is_valids_length);
-      return FALSE;
-    }
-  }
-
-  if (is_valids_length > 0) {
-    uint8_t valid_bytes[is_valids_length];
-    for (gint64 i = 0; i < values_length; ++i) {
-      valid_bytes[i] = is_valids[i];
-    }
-    auto status = arrow_builder->Append(values, values_length, valid_bytes);
-    return garrow_error_check(error, status, context);
-  } else {
-    auto status = arrow_builder->Append(values, values_length, nullptr);
-    return garrow_error_check(error, status, context);
-  }
+  return garrow_array_builder_append_values<arrow::StringBuilder *>
+    (GARROW_ARRAY_BUILDER(builder),
+     values,
+     values_length,
+     is_valids,
+     is_valids_length,
+     error,
+     "[string-array-builder][append-values]");
 }
 
 
diff --git a/c_glib/test/test-array-builder.rb 
b/c_glib/test/test-array-builder.rb
index 6811102ca0..f67cfecf5e 100644
--- a/c_glib/test/test-array-builder.rb
+++ b/c_glib/test/test-array-builder.rb
@@ -41,6 +41,19 @@ def test_with_is_valids
                  builder.finish)
   end
 
+  def test_with_large_is_valids
+    builder = create_builder
+    n = 10000
+    large_sample_values = sample_values * n
+    large_is_valids = [true, true, false] * n
+    builder.append_values(large_sample_values, large_is_valids)
+    sample_values_with_null = sample_values
+    sample_values_with_null[2] = nil
+    large_sample_values_with_null = sample_values_with_null * n
+    assert_equal(build_array(large_sample_values_with_null),
+                 builder.finish)
+  end
+
   def test_mismatch_length
     builder = create_builder
     message = "[#{builder_class_name}][append-values]: " +


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> garrow_array_builder_append_values() won't work for large arrays
> ----------------------------------------------------------------
>
>                 Key: ARROW-2457
>                 URL: https://issues.apache.org/jira/browse/ARROW-2457
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C, C++, GLib
>    Affects Versions: 0.8.0, 0.9.0
>            Reporter: Haralampos Gavriilidis
>            Assignee: Kouhei Sutou
>            Priority: Major
>              Labels: pull-request-available
>
> I am using garrow_array_builder_append_values() to transform a native C array 
> to an Arrow array, without calling arrow_array_builder_append multiple times. 
> When calling garrow_array_builder_append_values() in array-builder.cpp with 
> following signature:
> {code:java}
> garrow_array_builder_append_values(GArrowArrayBuilder *builder,
> const VALUE *values,
> gint64 values_length,
> const gboolean *is_valids,
> gint64 is_valids_length,
> GError **error,
> const gchar *context)
> {code}
> it will fail for large arrays. This is probably happening because the 
> is_valids array is copied to the valid_bytes array (of different type), for 
> which the memory is allocated on the stack, and not on the heap, like shown 
> on the snippet below:
> {code:java}
> uint8_t valid_bytes[is_valids_length];
> for (gint64 i = 0; i < is_valids_length; ++i){ 
>   valid_bytes[i] = is_valids[i]; 
> }
> {code}
>  A way to avoid this problem would be to allocate memory for the valid_bytes 
> array using malloc() or something similar. Is this behavior intended, maybe 
> because no large arrays should be handed over to that function, or it is 
> rather a bug?



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to