This is an automated email from the ASF dual-hosted git repository.

sgilmore pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 8515a08b92 GH-46877: [MATLAB] Add 
`arrow.tabular.Table.fromRecordBatches` static method (#46885)
8515a08b92 is described below

commit 8515a08b92b92ea6ba7ca6588ed336487322fb43
Author: Sarah Gilmore <[email protected]>
AuthorDate: Mon Jun 23 15:43:48 2025 -0400

    GH-46877: [MATLAB] Add `arrow.tabular.Table.fromRecordBatches` static 
method (#46885)
    
    ### Rationale for this change
    
    This change makes it possible to create an `arrow.tabular.Table` instance 
from a list of `arrow.tabular.RecordBatch` instances whose `Schema`s are 
consistent.
    
    ### What changes are included in this PR?
    
    Added a new static method called `fromRecordBatches` to the MATLAB class 
`arrow.tabular.Table`. This method constructs an `arrow.tabular.Table` 
from one or more `arrow.tabular.RecordBatch` instances that share the same `Schema`.
    
    **Usage Example**
    ```matlab
    >> rb1 = arrow.recordBatch(table([1:5]', [6:10]'));
    >> rb2 = arrow.recordBatch(table([11:15]', [16:20]'));
    
    >> table = arrow.tabular.Table.fromRecordBatches(rb1, rb2)
    
    table =
    
      Arrow Table with 10 rows and 2 columns:
    
        Schema:
    
            Var1: Float64 | Var2: Float64
    
        First Row:
    
            1 | 6
    ```
    
    **Error Message Examples**
    ```matlab
    % Error message when fromRecordBatches is called with zero input arguments
    >> arrow.tabular.Table.fromRecordBatches()
    Error using arrow.tabular.Table.fromRecordBatches (line 154)
    The fromRecordBatches method requires at least one RecordBatch to be 
supplied.
    
    % Error message when fromRecordBatches is given RecordBatches whose Schemas 
are inconsistent.
    >> rb1 = arrow.recordBatch(table(1, 2, VariableNames=["Num1", "Num2"]));
    >> rb2 = arrow.recordBatch(table(1, "A", VariableNames=["Num1", 
"Letter1"]));
    >> arrow.tabular.Table.fromRecordBatches(rb1, rb2)
    Error using arrow.tabular.Table.fromRecordBatches (line 167)
    All RecordBatches must have the same Schema.
    
    Schema of RecordBatch 2 is
    
            Num1: Float64 | Letter1: String
    
    Expected RecordBatch Schema to be
    
            Num1: Float64 | Num2: Float64
    ```
    
    ### Are these changes tested?
    
    Yes. Added four new test cases to the MATLAB test class `tTable`:
    
    1. `FromRecordBatchesZeroInputsError`
    2. `FromRecordBatchesOneInput`
    3. `FromRecordBatchesMultipleInputs`
    4. `FromRecordBatchesInconsistentSchemaError`
    
    ### Are there any user-facing changes?
    
    Yes. Users can now create an `arrow.tabular.Table` instance via the static 
method `fromRecordBatches`.
    * GitHub Issue: #46877
    
    Lead-authored-by: Sarah Gilmore <[email protected]>
    Co-authored-by: Sarah Gilmore <[email protected]>
    Co-authored-by: Kevin Gurney <[email protected]>
    Signed-off-by: Sarah Gilmore <[email protected]>
---
 matlab/src/cpp/arrow/matlab/error/error.h          |  1 +
 matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc | 71 +++++++++++++++++++---
 matlab/src/matlab/+arrow/+tabular/Table.m          | 33 +++++++++-
 matlab/src/matlab/+arrow/table.m                   |  9 +--
 matlab/test/arrow/tabular/tTable.m                 | 51 ++++++++++++++++
 5 files changed, 149 insertions(+), 16 deletions(-)

diff --git a/matlab/src/cpp/arrow/matlab/error/error.h 
b/matlab/src/cpp/arrow/matlab/error/error.h
index 47bde56dac..74b8d5fd41 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -253,5 +253,6 @@ static const char* IPC_RECORD_BATCH_READ_INVALID_INDEX = 
"arrow:io:ipc:InvalidIn
 static const char* IPC_RECORD_BATCH_READ_FAILED = "arrow:io:ipc:ReadFailed";
 static const char* IPC_TABLE_READ_FAILED = "arrow:io:ipc:TableReadFailed";
 static const char* IPC_END_OF_STREAM = "arrow:io:ipc:EndOfStream";
+static const char* TABLE_MAKE_UNKNOWN_METHOD = "arrow:table:UnknownMakeMethod";
 
 }  // namespace arrow::matlab::error
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc 
b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
index d7e31de4e7..ff7de96e34 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
@@ -18,11 +18,13 @@
 #include "libmexclass/proxy/ProxyManager.h"
 
 #include "arrow/matlab/array/proxy/array.h"
+
 #include "arrow/matlab/array/proxy/chunked_array.h"
 #include "arrow/matlab/array/proxy/wrap.h"
 
 #include "arrow/matlab/error/error.h"
 #include "arrow/matlab/tabular/get_row_as_string.h"
+#include "arrow/matlab/tabular/proxy/record_batch.h"
 #include "arrow/matlab/tabular/proxy/schema.h"
 #include "arrow/matlab/tabular/proxy/table.h"
 
@@ -34,6 +36,8 @@
 
 namespace arrow::matlab::tabular::proxy {
 
+namespace mda = ::matlab::data;
+
 namespace {
 libmexclass::error::Error makeEmptyTableError() {
   const std::string error_msg =
@@ -70,7 +74,6 @@ Table::Table(std::shared_ptr<arrow::Table> table) : 
table{table} {
 std::shared_ptr<arrow::Table> Table::unwrap() { return table; }
 
 void Table::toString(libmexclass::proxy::method::Context& context) {
-  namespace mda = ::matlab::data;
   MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string,
                                       
arrow::util::UTF8StringToUTF16(table->ToString()),
                                       context, 
error::UNICODE_CONVERSION_ERROR_ID);
@@ -79,12 +82,11 @@ void Table::toString(libmexclass::proxy::method::Context& 
context) {
   context.outputs[0] = str_mda;
 }
 
-libmexclass::proxy::MakeResult Table::make(
-    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+namespace {
+libmexclass::proxy::MakeResult from_arrays(const mda::StructArray& opts) {
   using ArrayProxy = arrow::matlab::array::proxy::Array;
   using TableProxy = arrow::matlab::tabular::proxy::Table;
-  namespace mda = ::matlab::data;
-  mda::StructArray opts = constructor_arguments[0];
+
   const mda::TypedArray<uint64_t> arrow_array_proxy_ids = 
opts[0]["ArrayProxyIDs"];
   const mda::StringArray column_names = opts[0]["ColumnNames"];
 
@@ -114,9 +116,64 @@ libmexclass::proxy::MakeResult Table::make(
                          error::SCHEMA_BUILDER_FINISH_ERROR_ID);
   const auto num_rows = arrow_arrays.size() == 0 ? 0 : 
arrow_arrays[0]->length();
   const auto table = arrow::Table::Make(schema, arrow_arrays, num_rows);
-  auto table_proxy = std::make_shared<TableProxy>(table);
+  return std::make_shared<TableProxy>(table);
+}
+
+libmexclass::proxy::MakeResult from_record_batches(const mda::StructArray& 
opts) {
+  using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;
+  using TableProxy = arrow::matlab::tabular::proxy::Table;
 
-  return table_proxy;
+  size_t num_rows = 0;
+  const mda::TypedArray<uint64_t> record_batch_proxy_ids = 
opts[0]["RecordBatchProxyIDs"];
+
+  std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches;
+  // Retrieve all of the Arrow RecordBatch Proxy instances from the libmexclass
+  // ProxyManager.
+  for (const auto& proxy_id : record_batch_proxy_ids) {
+    auto proxy = libmexclass::proxy::ProxyManager::getProxy(proxy_id);
+    auto record_batch_proxy = std::static_pointer_cast<RecordBatch>(proxy);
+    auto record_batch = record_batch_proxy->unwrap();
+    record_batches.push_back(record_batch);
+    num_rows += record_batches.back()->num_rows();
+  }
+
+  // The MATLAB client code that calls this function is responsible for 
pre-validating
+  // that this function is called with at least one RecordBatch.
+  auto schema = record_batches[0]->schema();
+  size_t num_columns = schema->num_fields();
+  std::vector<std::shared_ptr<ChunkedArray>> columns(num_columns);
+
+  size_t num_batches = record_batches.size();
+
+  for (size_t i = 0; i < num_columns; ++i) {
+    std::vector<std::shared_ptr<Array>> column_arrays(num_batches);
+    for (size_t j = 0; j < num_batches; ++j) {
+      column_arrays[j] = record_batches[j]->column(i);
+    }
+    columns[i] = std::make_shared<ChunkedArray>(column_arrays, 
schema->field(i)->type());
+  }
+  const auto table = arrow::Table::Make(std::move(schema), std::move(columns), 
num_rows);
+  return std::make_shared<TableProxy>(table);
+}
+}  // anonymous namespace
+
+libmexclass::proxy::MakeResult Table::make(
+    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+  mda::StructArray opts = constructor_arguments[0];
+  const mda::StringArray method = opts[0]["Method"];
+
+  if (method[0] == u"from_arrays") {
+    return from_arrays(opts);
+  } else if (method[0] == u"from_record_batches") {
+    return from_record_batches(opts);
+  } else {
+    const auto method_name_utf16 = std::u16string(method[0]);
+    MATLAB_ASSIGN_OR_ERROR(const auto method_name_utf8,
+                           arrow::util::UTF16StringToUTF8(method_name_utf16),
+                           error::UNICODE_CONVERSION_ERROR_ID);
+    const std::string error_msg = "Unknown make method: " + method_name_utf8;
+    return libmexclass::error::Error{error::TABLE_MAKE_UNKNOWN_METHOD, 
error_msg};
+  }
 }
 
 void Table::getNumRows(libmexclass::proxy::method::Context& context) {
diff --git a/matlab/src/matlab/+arrow/+tabular/Table.m 
b/matlab/src/matlab/+arrow/+tabular/Table.m
index 1ed205d639..48e6b09edc 100644
--- a/matlab/src/matlab/+arrow/+tabular/Table.m
+++ b/matlab/src/matlab/+arrow/+tabular/Table.m
@@ -139,7 +139,38 @@ classdef Table < matlab.mixin.CustomDisplay & 
matlab.mixin.Scalar
             validateColumnNames(opts.ColumnNames, numColumns);
 
             arrayProxyIDs = getArrayProxyIDs(arrowArrays);
-            args = struct(ArrayProxyIDs=arrayProxyIDs, 
ColumnNames=opts.ColumnNames);
+            args = struct(Method="from_arrays", ArrayProxyIDs=arrayProxyIDs, 
ColumnNames=opts.ColumnNames);
+            proxyName = "arrow.tabular.proxy.Table";
+            proxy = arrow.internal.proxy.create(proxyName, args);
+            arrowTable = arrow.tabular.Table(proxy);
+        end
+
+        function arrowTable = fromRecordBatches(batches)
+            arguments(Repeating)
+                batches(1, 1) arrow.tabular.RecordBatch
+            end
+            if numel(batches) == 0
+                msg = "The fromRecordBatches method requires at least one 
RecordBatch to be supplied.";
+                error("arrow:Table:FromRecordBatches:ZeroBatches", msg);
+            elseif numel(batches) > 1
+                % Verify that all supplied RecordBatches have a consistent 
Schema.
+                firstSchema = batches{1}.Schema;
+                otherSchemas = cellfun(@(rb) rb.Schema, batches(2:end), 
UniformOutput=false);
+                idx = cellfun(@(other) ~isequal(firstSchema, other), 
otherSchemas, UniformOutput=true);
+                inconsistentSchemaIndex = find(idx, 1,"first");
+                if ~isempty(inconsistentSchemaIndex)
+                    inconsistentSchemaIndex = inconsistentSchemaIndex + 1;
+                    expectedSchema = 
arrow.tabular.internal.display.getSchemaString(firstSchema);
+                    inconsistentSchema = 
arrow.tabular.internal.display.getSchemaString(batches{inconsistentSchemaIndex}.Schema);
+                    msg = "All RecordBatches must have the same 
Schema.\n\nSchema of RecordBatch %d is\n\n\t%s\n\nExpected RecordBatch Schema 
to be\n\n\t%s";
+                    msg = compose(msg, inconsistentSchemaIndex, 
inconsistentSchema, expectedSchema);
+                    error("arrow:Table:FromRecordBatches:InconsistentSchema", 
msg);
+                end
+            end
+
+            % TODO: Rename getArrayProxyIDs to getProxyIDs
+            proxyIDs = arrow.array.internal.getArrayProxyIDs(batches);
+            args = struct(Method="from_record_batches", 
RecordBatchProxyIDs=proxyIDs);
             proxyName = "arrow.tabular.proxy.Table";
             proxy = arrow.internal.proxy.create(proxyName, args);
             arrowTable = arrow.tabular.Table(proxy);
diff --git a/matlab/src/matlab/+arrow/table.m b/matlab/src/matlab/+arrow/table.m
index 1f54481433..0af824cac0 100644
--- a/matlab/src/matlab/+arrow/table.m
+++ b/matlab/src/matlab/+arrow/table.m
@@ -20,14 +20,7 @@ function arrowTable = table(matlabTable)
         % ambiguous name parsing issue with MATLAB table type and arrow.table.
         matlabTable {istable} = table.empty(0, 0)
     end
-
     arrowArrays = arrow.tabular.internal.decompose(matlabTable);
-    arrayProxyIDs = arrow.array.internal.getArrayProxyIDs(arrowArrays);
-
     columnNames = string(matlabTable.Properties.VariableNames);
-    args = struct(ArrayProxyIDs=arrayProxyIDs, ColumnNames=columnNames);
-    proxyName = "arrow.tabular.proxy.Table";
-    proxy = arrow.internal.proxy.create(proxyName, args);
-
-    arrowTable = arrow.tabular.Table(proxy);
+    arrowTable = arrow.tabular.Table.fromArrays(arrowArrays{:}, 
ColumnNames=columnNames);
 end
diff --git a/matlab/test/arrow/tabular/tTable.m 
b/matlab/test/arrow/tabular/tTable.m
index 63b21bdc09..8e5883232e 100644
--- a/matlab/test/arrow/tabular/tTable.m
+++ b/matlab/test/arrow/tabular/tTable.m
@@ -664,6 +664,57 @@ classdef tTable < matlab.unittest.TestCase
             testCase.verifyFalse(isequal(t1, t2, t3, t4));
         end
 
+        function FromRecordBatchesZeroInputsError(testCase)
+            % Verify the arrow.tabular.Table.fromRecordBatches function
+            % throws an `arrow:Table:FromRecordBatches:ZeroBatches` 
+            % exception if called with zero input arguments.
+            import arrow.tabular.Table
+            fcn = @() Table.fromRecordBatches();
+            testCase.verifyError(fcn, 
"arrow:Table:FromRecordBatches:ZeroBatches");
+        end
+
+        function FromRecordBatchesOneInput(testCase)
+            % Verify the arrow.tabular.Table.fromRecordBatches function
+            % returns the expected arrow.tabular.Table instance when 
+            % provided a single RecordBatch as input.
+            import arrow.tabular.Table
+            matlabTable = table([1; 2], ["A"; "B"], VariableNames=["Number" 
"Letter"]);
+            recordBatch = arrow.recordBatch(matlabTable);
+            arrowTable = Table.fromRecordBatches(recordBatch);
+            testCase.verifyTable(arrowTable, ["Number", "Letter"], 
["arrow.type.Float64Type", "arrow.type.StringType"], matlabTable);
+        end
+
+        function FromRecordBatchesMultipleInputs(testCase)
+            % Verify the arrow.tabular.Table.fromRecordBatches function
+            % returns the expected arrow.tabular.Table instance when 
+            % provided multiple RecordBatches as input.
+            import arrow.tabular.Table
+            matlabTable1 = table([1; 2], ["A"; "B"], VariableNames=["Number" 
"Letter"]);
+            matlabTable2 = table([10; 20; 30], ["A1"; "B1"; "C1"], 
VariableNames=["Number" "Letter"]);
+            matlabTable3 = table([100; 200], ["A2"; "B2"], 
VariableNames=["Number" "Letter"]);
+
+            recordBatch1 = arrow.recordBatch(matlabTable1);
+            recordBatch2 = arrow.recordBatch(matlabTable2);
+            recordBatch3 = arrow.recordBatch(matlabTable3);
+
+            arrowTable = Table.fromRecordBatches(recordBatch1, recordBatch2, 
recordBatch3);
+            testCase.verifyTable(arrowTable, ["Number", "Letter"], 
["arrow.type.Float64Type", "arrow.type.StringType"], [matlabTable1; 
matlabTable2; matlabTable3]);
+        end
+
+        function FromRecordBatchesInconsistentSchemaError(testCase)
+            % Verify the arrow.tabular.Table.fromRecordBatches function
+            % throws an `arrow:Table:FromRecordBatches:InconsistentSchema`
+            % exception if the Schemas of the provided RecordBatches are 
+            % inconsistent.
+            import arrow.tabular.Table
+            matlabTable1 = table("A", 1);
+            matlabTable2 = table(2, "B");
+            recordBatch1 = arrow.recordBatch(matlabTable1);
+            recordBatch2 = arrow.recordBatch(matlabTable2);
+
+            fcn = @() Table.fromRecordBatches(recordBatch1, recordBatch2);
+            testCase.verifyError(fcn, 
"arrow:Table:FromRecordBatches:InconsistentSchema");
+        end
     end
 
     methods

Reply via email to