This is an automated email from the ASF dual-hosted git repository.
sgilmore pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8515a08b92 GH-46877: [MATLAB] Add
`arrow.tabular.Table.fromRecordBatches` static method (#46885)
8515a08b92 is described below
commit 8515a08b92b92ea6ba7ca6588ed336487322fb43
Author: Sarah Gilmore <[email protected]>
AuthorDate: Mon Jun 23 15:43:48 2025 -0400
GH-46877: [MATLAB] Add `arrow.tabular.Table.fromRecordBatches` static
method (#46885)
### Rationale for this change
This change makes it possible to create an `arrow.tabular.Table` instance
from a list of `arrow.tabular.RecordBatch` instances whose `Schema`s are
consistent.
### What changes are included in this PR?
Added a new static method called `fromRecordBatches` to the MATLAB class
`arrow.tabular.Table`. This method constructs an `arrow.tabular.Table`
from a variable number of `arrow.tabular.RecordBatch` instances.
**Usage Example**
```matlab
>> rb1 = arrow.recordBatch(table([1:5]', [6:10]'));
>> rb2 = arrow.recordBatch(table([11:15]', [16:20]'));
>> table = arrow.tabular.Table.fromRecordBatches(rb1, rb2)
table =
Arrow Table with 10 rows and 2 columns:
Schema:
Var1: Float64 | Var2: Float64
First Row:
1 | 6
```
**Error Message Examples**
```matlab
% Error message when fromRecordBatches is called with zero input arguments
>> arrow.tabular.Table.fromRecordBatches()
Error using arrow.tabular.Table.fromRecordBatches (line 154)
The fromRecordBatches method requires at least one RecordBatch to be
supplied.
% Error message when fromRecordBatches is given RecordBatches whose Schemas
% are inconsistent.
>> rb1 = arrow.recordBatch(table(1, 2, VariableNames=["Num1", "Num2"]));
>> rb2 = arrow.recordBatch(table(1, "A", VariableNames=["Num1", "Letter1"]));
>> arrow.tabular.Table.fromRecordBatches(rb1, rb2)
Error using arrow.tabular.Table.fromRecordBatches (line 167)
All RecordBatches must have the same Schema.
Schema of RecordBatch 2 is
Num1: Float64 | Letter1: String
Expected RecordBatch Schema to be
Num1: Float64 | Num2: Float64
```
### Are these changes tested?
Yes. Added four new test cases to the MATLAB test class `tTable`:
1. `FromRecordBatchesZeroInputsError`
2. `FromRecordBatchesOneInput`
3. `FromRecordBatchesMultipleInputs`
4. `FromRecordBatchesInconsistentSchemaError`
### Are there any user-facing changes?
Yes. Users can now create an `arrow.tabular.Table` instance via the static
method `fromRecordBatches`.
* GitHub Issue: #46877
Lead-authored-by: Sarah Gilmore <[email protected]>
Co-authored-by: Sarah Gilmore <[email protected]>
Co-authored-by: Kevin Gurney <[email protected]>
Signed-off-by: Sarah Gilmore <[email protected]>
---
matlab/src/cpp/arrow/matlab/error/error.h | 1 +
matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc | 71 +++++++++++++++++++---
matlab/src/matlab/+arrow/+tabular/Table.m | 33 +++++++++-
matlab/src/matlab/+arrow/table.m | 9 +--
matlab/test/arrow/tabular/tTable.m | 51 ++++++++++++++++
5 files changed, 149 insertions(+), 16 deletions(-)
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h
b/matlab/src/cpp/arrow/matlab/error/error.h
index 47bde56dac..74b8d5fd41 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -253,5 +253,6 @@ static const char* IPC_RECORD_BATCH_READ_INVALID_INDEX =
"arrow:io:ipc:InvalidIn
static const char* IPC_RECORD_BATCH_READ_FAILED = "arrow:io:ipc:ReadFailed";
static const char* IPC_TABLE_READ_FAILED = "arrow:io:ipc:TableReadFailed";
static const char* IPC_END_OF_STREAM = "arrow:io:ipc:EndOfStream";
+static const char* TABLE_MAKE_UNKNOWN_METHOD = "arrow:table:UnknownMakeMethod";
} // namespace arrow::matlab::error
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
index d7e31de4e7..ff7de96e34 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
@@ -18,11 +18,13 @@
#include "libmexclass/proxy/ProxyManager.h"
#include "arrow/matlab/array/proxy/array.h"
+
#include "arrow/matlab/array/proxy/chunked_array.h"
#include "arrow/matlab/array/proxy/wrap.h"
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/tabular/get_row_as_string.h"
+#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
#include "arrow/matlab/tabular/proxy/table.h"
@@ -34,6 +36,8 @@
namespace arrow::matlab::tabular::proxy {
+namespace mda = ::matlab::data;
+
namespace {
libmexclass::error::Error makeEmptyTableError() {
const std::string error_msg =
@@ -70,7 +74,6 @@ Table::Table(std::shared_ptr<arrow::Table> table) :
table{table} {
std::shared_ptr<arrow::Table> Table::unwrap() { return table; }
void Table::toString(libmexclass::proxy::method::Context& context) {
- namespace mda = ::matlab::data;
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string,
arrow::util::UTF8StringToUTF16(table->ToString()),
context,
error::UNICODE_CONVERSION_ERROR_ID);
@@ -79,12 +82,11 @@ void Table::toString(libmexclass::proxy::method::Context&
context) {
context.outputs[0] = str_mda;
}
-libmexclass::proxy::MakeResult Table::make(
- const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+namespace {
+libmexclass::proxy::MakeResult from_arrays(const mda::StructArray& opts) {
using ArrayProxy = arrow::matlab::array::proxy::Array;
using TableProxy = arrow::matlab::tabular::proxy::Table;
- namespace mda = ::matlab::data;
- mda::StructArray opts = constructor_arguments[0];
+
const mda::TypedArray<uint64_t> arrow_array_proxy_ids =
opts[0]["ArrayProxyIDs"];
const mda::StringArray column_names = opts[0]["ColumnNames"];
@@ -114,9 +116,64 @@ libmexclass::proxy::MakeResult Table::make(
error::SCHEMA_BUILDER_FINISH_ERROR_ID);
const auto num_rows = arrow_arrays.size() == 0 ? 0 :
arrow_arrays[0]->length();
const auto table = arrow::Table::Make(schema, arrow_arrays, num_rows);
- auto table_proxy = std::make_shared<TableProxy>(table);
+ return std::make_shared<TableProxy>(table);
+}
+
+libmexclass::proxy::MakeResult from_record_batches(const mda::StructArray&
opts) {
+ using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;
+ using TableProxy = arrow::matlab::tabular::proxy::Table;
- return table_proxy;
+ size_t num_rows = 0;
+ const mda::TypedArray<uint64_t> record_batch_proxy_ids =
opts[0]["RecordBatchProxyIDs"];
+
+ std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches;
+ // Retrieve all of the Arrow RecordBatch Proxy instances from the libmexclass
+ // ProxyManager.
+ for (const auto& proxy_id : record_batch_proxy_ids) {
+ auto proxy = libmexclass::proxy::ProxyManager::getProxy(proxy_id);
+ auto record_batch_proxy = std::static_pointer_cast<RecordBatch>(proxy);
+ auto record_batch = record_batch_proxy->unwrap();
+ record_batches.push_back(record_batch);
+ num_rows += record_batches.back()->num_rows();
+ }
+
+ // The MATLAB client code that calls this function is responsible for
pre-validating
+ // that this function is called with at least one RecordBatch.
+ auto schema = record_batches[0]->schema();
+ size_t num_columns = schema->num_fields();
+ std::vector<std::shared_ptr<ChunkedArray>> columns(num_columns);
+
+ size_t num_batches = record_batches.size();
+
+ for (size_t i = 0; i < num_columns; ++i) {
+ std::vector<std::shared_ptr<Array>> column_arrays(num_batches);
+ for (size_t j = 0; j < num_batches; ++j) {
+ column_arrays[j] = record_batches[j]->column(i);
+ }
+ columns[i] = std::make_shared<ChunkedArray>(column_arrays,
schema->field(i)->type());
+ }
+ const auto table = arrow::Table::Make(std::move(schema), std::move(columns),
num_rows);
+ return std::make_shared<TableProxy>(table);
+}
+} // anonymous namespace
+
+libmexclass::proxy::MakeResult Table::make(
+ const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+ mda::StructArray opts = constructor_arguments[0];
+ const mda::StringArray method = opts[0]["Method"];
+
+ if (method[0] == u"from_arrays") {
+ return from_arrays(opts);
+ } else if (method[0] == u"from_record_batches") {
+ return from_record_batches(opts);
+ } else {
+ const auto method_name_utf16 = std::u16string(method[0]);
+ MATLAB_ASSIGN_OR_ERROR(const auto method_name_utf8,
+ arrow::util::UTF16StringToUTF8(method_name_utf16),
+ error::UNICODE_CONVERSION_ERROR_ID);
+ const std::string error_msg = "Unknown make method: " + method_name_utf8;
+ return libmexclass::error::Error{error::TABLE_MAKE_UNKNOWN_METHOD,
error_msg};
+ }
}
void Table::getNumRows(libmexclass::proxy::method::Context& context) {
diff --git a/matlab/src/matlab/+arrow/+tabular/Table.m
b/matlab/src/matlab/+arrow/+tabular/Table.m
index 1ed205d639..48e6b09edc 100644
--- a/matlab/src/matlab/+arrow/+tabular/Table.m
+++ b/matlab/src/matlab/+arrow/+tabular/Table.m
@@ -139,7 +139,38 @@ classdef Table < matlab.mixin.CustomDisplay &
matlab.mixin.Scalar
validateColumnNames(opts.ColumnNames, numColumns);
arrayProxyIDs = getArrayProxyIDs(arrowArrays);
- args = struct(ArrayProxyIDs=arrayProxyIDs,
ColumnNames=opts.ColumnNames);
+ args = struct(Method="from_arrays", ArrayProxyIDs=arrayProxyIDs,
ColumnNames=opts.ColumnNames);
+ proxyName = "arrow.tabular.proxy.Table";
+ proxy = arrow.internal.proxy.create(proxyName, args);
+ arrowTable = arrow.tabular.Table(proxy);
+ end
+
+ function arrowTable = fromRecordBatches(batches)
+ arguments(Repeating)
+ batches(1, 1) arrow.tabular.RecordBatch
+ end
+ if numel(batches) == 0
+ msg = "The fromRecordBatches method requires at least one
RecordBatch to be supplied.";
+ error("arrow:Table:FromRecordBatches:ZeroBatches", msg);
+ elseif numel(batches) > 1
+ % Verify that all supplied RecordBatches have a consistent
Schema.
+ firstSchema = batches{1}.Schema;
+ otherSchemas = cellfun(@(rb) rb.Schema, batches(2:end),
UniformOutput=false);
+ idx = cellfun(@(other) ~isequal(firstSchema, other),
otherSchemas, UniformOutput=true);
+ inconsistentSchemaIndex = find(idx, 1,"first");
+ if ~isempty(inconsistentSchemaIndex)
+ inconsistentSchemaIndex = inconsistentSchemaIndex + 1;
+ expectedSchema =
arrow.tabular.internal.display.getSchemaString(firstSchema);
+ inconsistentSchema =
arrow.tabular.internal.display.getSchemaString(batches{inconsistentSchemaIndex}.Schema);
+ msg = "All RecordBatches must have the same
Schema.\n\nSchema of RecordBatch %d is\n\n\t%s\n\nExpected RecordBatch Schema
to be\n\n\t%s";
+ msg = compose(msg, inconsistentSchemaIndex,
inconsistentSchema, expectedSchema);
+ error("arrow:Table:FromRecordBatches:InconsistentSchema",
msg);
+ end
+ end
+
+ % TODO: Rename getArrayProxyIDs to getProxyIDs
+ proxyIDs = arrow.array.internal.getArrayProxyIDs(batches);
+ args = struct(Method="from_record_batches",
RecordBatchProxyIDs=proxyIDs);
proxyName = "arrow.tabular.proxy.Table";
proxy = arrow.internal.proxy.create(proxyName, args);
arrowTable = arrow.tabular.Table(proxy);
diff --git a/matlab/src/matlab/+arrow/table.m b/matlab/src/matlab/+arrow/table.m
index 1f54481433..0af824cac0 100644
--- a/matlab/src/matlab/+arrow/table.m
+++ b/matlab/src/matlab/+arrow/table.m
@@ -20,14 +20,7 @@ function arrowTable = table(matlabTable)
% ambiguous name parsing issue with MATLAB table type and arrow.table.
matlabTable {istable} = table.empty(0, 0)
end
-
arrowArrays = arrow.tabular.internal.decompose(matlabTable);
- arrayProxyIDs = arrow.array.internal.getArrayProxyIDs(arrowArrays);
-
columnNames = string(matlabTable.Properties.VariableNames);
- args = struct(ArrayProxyIDs=arrayProxyIDs, ColumnNames=columnNames);
- proxyName = "arrow.tabular.proxy.Table";
- proxy = arrow.internal.proxy.create(proxyName, args);
-
- arrowTable = arrow.tabular.Table(proxy);
+ arrowTable = arrow.tabular.Table.fromArrays(arrowArrays{:},
ColumnNames=columnNames);
end
diff --git a/matlab/test/arrow/tabular/tTable.m
b/matlab/test/arrow/tabular/tTable.m
index 63b21bdc09..8e5883232e 100644
--- a/matlab/test/arrow/tabular/tTable.m
+++ b/matlab/test/arrow/tabular/tTable.m
@@ -664,6 +664,57 @@ classdef tTable < matlab.unittest.TestCase
testCase.verifyFalse(isequal(t1, t2, t3, t4));
end
+ function FromRecordBatchesZeroInputsError(testCase)
+ % Verify the arrow.tabular.Table.fromRecordBatches function
+ % throws an `arrow:Table:FromRecordBatches:ZeroBatches`
+ % exception if called with zero input arguments.
+ import arrow.tabular.Table
+ fcn = @() Table.fromRecordBatches();
+ testCase.verifyError(fcn,
"arrow:Table:FromRecordBatches:ZeroBatches");
+ end
+
+ function FromRecordBatchesOneInput(testCase)
+ % Verify the arrow.tabular.Table.fromRecordBatches function
+ % returns the expected arrow.tabular.Table instance when
+ % provided a single RecordBatch as input.
+ import arrow.tabular.Table
+ matlabTable = table([1; 2], ["A"; "B"], VariableNames=["Number"
"Letter"]);
+ recordBatch = arrow.recordBatch(matlabTable);
+ arrowTable = Table.fromRecordBatches(recordBatch);
+ testCase.verifyTable(arrowTable, ["Number", "Letter"],
["arrow.type.Float64Type", "arrow.type.StringType"], matlabTable);
+ end
+
+ function FromRecordBatchesMultipleInputs(testCase)
+ % Verify the arrow.tabular.Table.fromRecordBatches function
+ % returns the expected arrow.tabular.Table instance when
+ % provided multiple RecordBatches as input.
+ import arrow.tabular.Table
+ matlabTable1 = table([1; 2], ["A"; "B"], VariableNames=["Number"
"Letter"]);
+ matlabTable2 = table([10; 20; 30], ["A1"; "B1"; "C1"],
VariableNames=["Number" "Letter"]);
+ matlabTable3 = table([100; 200], ["A2"; "B2"],
VariableNames=["Number" "Letter"]);
+
+ recordBatch1 = arrow.recordBatch(matlabTable1);
+ recordBatch2 = arrow.recordBatch(matlabTable2);
+ recordBatch3 = arrow.recordBatch(matlabTable3);
+
+ arrowTable = Table.fromRecordBatches(recordBatch1, recordBatch2,
recordBatch3);
+ testCase.verifyTable(arrowTable, ["Number", "Letter"],
["arrow.type.Float64Type", "arrow.type.StringType"], [matlabTable1;
matlabTable2; matlabTable3]);
+ end
+
+ function FromRecordBatchesInconsistentSchemaError(testCase)
+ % Verify the arrow.tabular.Table.fromRecordBatches function
+ % throws an `arrow:Table:FromRecordBatches:InconsistentSchema`
+ % exception if the Schemas of the provided RecordBatches are
+ % inconsistent.
+ import arrow.tabular.Table
+ matlabTable1 = table("A", 1);
+ matlabTable2 = table(2, "B");
+ recordBatch1 = arrow.recordBatch(matlabTable1);
+ recordBatch2 = arrow.recordBatch(matlabTable2);
+
+ fcn = @() Table.fromRecordBatches(recordBatch1, recordBatch2);
+ testCase.verifyError(fcn,
"arrow:Table:FromRecordBatches:InconsistentSchema");
+ end
end
methods