[
https://issues.apache.org/jira/browse/ARROW-1808?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16261734#comment-16261734
]
ASF GitHub Bot commented on ARROW-1808:
---------------------------------------
kou commented on issue #1337: ARROW-1808: [C++] Make RecordBatch, Table virtual
interfaces for column access
URL: https://github.com/apache/arrow/pull/1337#issuecomment-346203089
> It seems like respecting the schema is the right approach.
I agree with you.
> I could add boundschecking to `SimpleRecordBatch::column(i)` and return
null if the index is out of bounds, would that help at all?
I think that it's better that we do it in more higher layer such as GLib
bindings layer. I think that we don't do needless checks in C++ layer for
simplicity and performance.
I'll add boundschecking in GLib bindings later. For now, I think that it's
better that we always validate a newly created record batch. If we always
validate it, we can assume that all record batches always have valid data.
```patch
From d9260c09765b1cd337cda5a09497ee1b985ef623 Mon Sep 17 00:00:00 2001
From: Kouhei Sutou <[email protected]>
Date: Wed, 22 Nov 2017 09:09:30 +0900
Subject: [PATCH] [GLib] Always validate on creating new record batch
---
c_glib/arrow-glib/record-batch.cpp | 13 ++++++++---
c_glib/arrow-glib/record-batch.h | 3 ++-
c_glib/example/go/write-batch.go | 10 +++++++--
c_glib/example/go/write-stream.go | 10 +++++++--
c_glib/test/test-record-batch.rb | 46
+++++++++++++++++++++++++-------------
5 files changed, 58 insertions(+), 24 deletions(-)
diff --git a/c_glib/arrow-glib/record-batch.cpp
b/c_glib/arrow-glib/record-batch.cpp
index f23a0cf7..73de6eeb 100644
--- a/c_glib/arrow-glib/record-batch.cpp
+++ b/c_glib/arrow-glib/record-batch.cpp
@@ -135,13 +135,15 @@ garrow_record_batch_class_init(GArrowRecordBatchClass
*klass)
* @schema: The schema of the record batch.
* @n_rows: The number of the rows in the record batch.
* @columns: (element-type GArrowArray): The columns in the record batch.
+ * @error: (nullable): Return location for a #GError or %NULL.
*
- * Returns: A newly created #GArrowRecordBatch.
+ * Returns: (nullable): A newly created #GArrowRecordBatch or %NULL on
error.
*/
GArrowRecordBatch *
garrow_record_batch_new(GArrowSchema *schema,
guint32 n_rows,
- GList *columns)
+ GList *columns,
+ GError **error)
{
std::vector<std::shared_ptr<arrow::Array>> arrow_columns;
for (GList *node = columns; node; node = node->next) {
@@ -152,7 +154,12 @@ garrow_record_batch_new(GArrowSchema *schema,
auto arrow_record_batch =
arrow::RecordBatch::Make(garrow_schema_get_raw(schema),
n_rows, arrow_columns);
- return garrow_record_batch_new_raw(&arrow_record_batch);
+ auto status = arrow_record_batch->Validate();
+ if (garrow_error_check(error, status, "[record-batch][new]")) {
+ return garrow_record_batch_new_raw(&arrow_record_batch);
+ } else {
+ return NULL;
+ }
}
/**
diff --git a/c_glib/arrow-glib/record-batch.h
b/c_glib/arrow-glib/record-batch.h
index 021f894f..823a42bb 100644
--- a/c_glib/arrow-glib/record-batch.h
+++ b/c_glib/arrow-glib/record-batch.h
@@ -68,7 +68,8 @@ GType garrow_record_batch_get_type(void) G_GNUC_CONST;
GArrowRecordBatch *garrow_record_batch_new(GArrowSchema *schema,
guint32 n_rows,
- GList *columns);
+ GList *columns,
+ GError **error);
gboolean garrow_record_batch_equal(GArrowRecordBatch *record_batch,
GArrowRecordBatch *other_record_batch);
diff --git a/c_glib/example/go/write-batch.go
b/c_glib/example/go/write-batch.go
index 9dbc3c00..f4d03ed9 100644
--- a/c_glib/example/go/write-batch.go
+++ b/c_glib/example/go/write-batch.go
@@ -188,7 +188,10 @@ func main() {
BuildDoubleArray(),
}
- recordBatch := arrow.NewRecordBatch(schema, 4, columns)
+ recordBatch, err := arrow.NewRecordBatch(schema, 4, columns)
+ if err != nil {
+ log.Fatalf("Failed to create record batch #1: %v", err)
+ }
_, err = writer.WriteRecordBatch(recordBatch)
if err != nil {
log.Fatalf("Failed to write record batch #1: %v", err)
@@ -198,7 +201,10 @@ func main() {
for i, column := range columns {
slicedColumns[i] = column.Slice(1, 3)
}
- recordBatch = arrow.NewRecordBatch(schema, 3, slicedColumns)
+ recordBatch, err = arrow.NewRecordBatch(schema, 3, slicedColumns)
+ if err != nil {
+ log.Fatalf("Failed to create record batch #2: %v", err)
+ }
_, err = writer.WriteRecordBatch(recordBatch)
if err != nil {
log.Fatalf("Failed to write record batch #2: %v", err)
diff --git a/c_glib/example/go/write-stream.go
b/c_glib/example/go/write-stream.go
index 244741e8..7225156a 100644
--- a/c_glib/example/go/write-stream.go
+++ b/c_glib/example/go/write-stream.go
@@ -188,7 +188,10 @@ func main() {
BuildDoubleArray(),
}
- recordBatch := arrow.NewRecordBatch(schema, 4, columns)
+ recordBatch, err := arrow.NewRecordBatch(schema, 4, columns)
+ if err != nil {
+ log.Fatalf("Failed to create record batch #1: %v", err)
+ }
_, err = writer.WriteRecordBatch(recordBatch)
if err != nil {
log.Fatalf("Failed to write record batch #1: %v", err)
@@ -198,7 +201,10 @@ func main() {
for i, column := range columns {
slicedColumns[i] = column.Slice(1, 3)
}
- recordBatch = arrow.NewRecordBatch(schema, 3, slicedColumns)
+ recordBatch, err = arrow.NewRecordBatch(schema, 3, slicedColumns)
+ if err != nil {
+ log.Fatalf("Failed to create record batch #2: %v", err)
+ }
writer.WriteRecordBatch(recordBatch)
_, err = writer.WriteRecordBatch(recordBatch)
if err != nil {
diff --git a/c_glib/test/test-record-batch.rb
b/c_glib/test/test-record-batch.rb
index 9fd34b7d..325944b8 100644
--- a/c_glib/test/test-record-batch.rb
+++ b/c_glib/test/test-record-batch.rb
@@ -18,18 +18,32 @@
class TestTable < Test::Unit::TestCase
include Helper::Buildable
- def test_new
- fields = [
- Arrow::Field.new("visible", Arrow::BooleanDataType.new),
- Arrow::Field.new("valid", Arrow::BooleanDataType.new),
- ]
- schema = Arrow::Schema.new(fields)
- columns = [
- build_boolean_array([true]),
- build_boolean_array([false]),
- ]
- record_batch = Arrow::RecordBatch.new(schema, 1, columns)
- assert_equal(1, record_batch.n_rows)
+ sub_test_case(".new") do
+ def test_valid
+ fields = [
+ Arrow::Field.new("visible", Arrow::BooleanDataType.new),
+ Arrow::Field.new("valid", Arrow::BooleanDataType.new),
+ ]
+ schema = Arrow::Schema.new(fields)
+ columns = [
+ build_boolean_array([true]),
+ build_boolean_array([false]),
+ ]
+ record_batch = Arrow::RecordBatch.new(schema, 1, columns)
+ assert_equal(1, record_batch.n_rows)
+ end
+
+ def test_no_columns
+ fields = [
+ Arrow::Field.new("visible", Arrow::BooleanDataType.new),
+ ]
+ schema = Arrow::Schema.new(fields)
+ message = "[record-batch][new]: " +
+ "Invalid: Number of columns did not match schema"
+ assert_raise(Arrow::Error::Invalid.new(message)) do
+ Arrow::RecordBatch.new(schema, 0, [])
+ end
+ end
end
sub_test_case("instance methods") do
@@ -40,7 +54,7 @@ class TestTable < Test::Unit::TestCase
]
schema = Arrow::Schema.new(fields)
columns = [
- build_boolean_array([true, false, true, false, true, false]),
+ build_boolean_array([true, false, true, false, true]),
build_boolean_array([false, true, false, true, false]),
]
@record_batch = Arrow::RecordBatch.new(schema, 5, columns)
@@ -53,7 +67,7 @@ class TestTable < Test::Unit::TestCase
]
schema = Arrow::Schema.new(fields)
columns = [
- build_boolean_array([true, false, true, false, true, false]),
+ build_boolean_array([true, false, true, false, true]),
build_boolean_array([false, true, false, true, false]),
]
other_record_batch = Arrow::RecordBatch.new(schema, 5, columns)
@@ -71,7 +85,7 @@ class TestTable < Test::Unit::TestCase
end
def test_columns
- assert_equal([6, 5],
+ assert_equal([5, 5],
@record_batch.columns.collect(&:length))
end
@@ -94,7 +108,7 @@ class TestTable < Test::Unit::TestCase
def test_to_s
assert_equal(<<-PRETTY_PRINT, @record_batch.to_s)
-visible: [true, false, true, false, true, false]
+visible: [true, false, true, false, true]
valid: [false, true, false, true, false]
PRETTY_PRINT
end
--
2.15.0
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [C++] Make RecordBatch interface virtual to permit record batches that
> lazy-materialize columns
> -----------------------------------------------------------------------------------------------
>
> Key: ARROW-1808
> URL: https://issues.apache.org/jira/browse/ARROW-1808
> Project: Apache Arrow
> Issue Type: Improvement
> Components: C++
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> This should be looked at soon to prevent having to define a different virtual
> interface for record batches. There are places where we are using the record
> batch constructor directly, and in some third party code (like MapD), so this
> might be good to get done for 0.8.0
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)