This is an automated email from the ASF dual-hosted git repository.
kevingurney pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 71329ce33a GH-37042: [MATLAB] Implement Feather V1 Writer using new
MATLAB Interface APIs (#37043)
71329ce33a is described below
commit 71329ce33a18a53e322514d0e463677ebad648c9
Author: sgilmore10 <[email protected]>
AuthorDate: Mon Aug 7 15:22:03 2023 -0400
GH-37042: [MATLAB] Implement Feather V1 Writer using new MATLAB Interface
APIs (#37043)
### Rationale for this change
Now that we have the basic building blocks for tabular IO in the MATLAB
Interface (`Array`, `Schema`, `RecordBatch`), we can implement a Feather V1
writer in terms of the new APIs.
This is the first in a series of pull requests in which we will work on
replacing the legacy feather V1 infrastructure with a new implementation that
uses the MATLAB Interface APIs. A side effect of doing this work is that we can
eventually delete a lot of legacy build infrastructure and code.
### What changes are included in this PR?
1. Added a new class called `arrow.internal.io.feather.Writer` which can be
used to write feather V1 files. It has one public property named `Filename` and
one public method `write`.
Below is an example of its usage:
```matlab
>> T = table([1; 2; 3], single([10; 11; 12]));
T =
3×2 table
Var1 Var2
____ ____
1 10
2 11
3 12
>> filename = "/tmp/table.feather";
>> writer = arrow.internal.io.feather.Writer(filename)
writer =
Writer with properties:
Filename: "/tmp/table.feather"
>> writer.write(T);
```
2. Added an `unwrap` method to `proxy::RecordBatch` so that the
`FeatherWriter::write` method can access the underlying `RecordBatch` from the
proxy.
3. Changed the `SetAccess` and `GetAccess` of the `Proxy` property on
`arrow.tabular.RecordBatch` to `private` and `public`, respectively.
### Are these changes tested?
Yes, added a new test file called `tRoundTrip.m` in the
`matlab/test/arrow/io/feather` folder.
### Are there any user-facing changes?
No.
### Future Directions
1. Add a new class for reading feather V1 files (See #37041).
2. Integrate this class in the public `featherwrite` function.
3. Once this class is integrated with `featherwrite`, we can delete the
legacy build infrastructure and source code.
* Closes: #37042
Authored-by: Sarah Gilmore <[email protected]>
Signed-off-by: Kevin Gurney <[email protected]>
---
matlab/src/cpp/arrow/matlab/error/error.h | 4 +
.../matlab/io/feather/proxy/feather_writer.cc | 90 ++++++++++++++++++++++
.../feather/proxy/feather_writer.h} | 24 +++---
matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 +
.../cpp/arrow/matlab/tabular/proxy/record_batch.cc | 4 +
.../cpp/arrow/matlab/tabular/proxy/record_batch.h | 2 +
.../matlab/+arrow/+internal/+io/+feather/Writer.m | 48 ++++++++++++
matlab/src/matlab/+arrow/+tabular/RecordBatch.m | 2 +-
matlab/test/arrow/io/feather/tRoundTrip.m | 52 +++++++++++++
matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 4 +-
10 files changed, 217 insertions(+), 15 deletions(-)
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h
b/matlab/src/cpp/arrow/matlab/error/error.h
index b7c0d7d696..e1d2982f28 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -180,4 +180,8 @@ namespace arrow::matlab::error {
static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE =
"arrow:array:UnknownProxyForArrayType";
static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH =
"arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch";
static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX =
"arrow:tabular:recordbatch:InvalidNumericColumnIndex";
+ static const char* FAILED_TO_OPEN_FILE_FOR_WRITE =
"arrow:io:FailedToOpenFileForWrite";
+ static const char* FEATHER_FAILED_TO_WRITE_TABLE =
"arrow:io:feather:FailedToWriteTable";
+ static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch";
+
}
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
new file mode 100644
index 0000000000..a27e1fb0e6
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/matlab/io/feather/proxy/feather_writer.h"
+#include "arrow/matlab/tabular/proxy/record_batch.h"
+#include "arrow/matlab/error/error.h"
+
+#include "arrow/result.h"
+#include "arrow/table.h"
+#include "arrow/util/utf8.h"
+
+#include "arrow/io/file.h"
+#include "arrow/ipc/feather.h"
+
+#include "libmexclass/proxy/ProxyManager.h"
+
+namespace arrow::matlab::io::feather::proxy {
+
+ FeatherWriter::FeatherWriter(const std::string& filename) :
filename{filename} {
+ REGISTER_METHOD(FeatherWriter, getFilename);
+ REGISTER_METHOD(FeatherWriter, write);
+ }
+
+ libmexclass::proxy::MakeResult FeatherWriter::make(const
libmexclass::proxy::FunctionArguments& constructor_arguments) {
+ namespace mda = ::matlab::data;
+ mda::StructArray opts = constructor_arguments[0];
+ const mda::StringArray filename_mda = opts[0]["Filename"];
+
+ const auto filename_utf16 = std::u16string(filename_mda[0]);
+ MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8,
+ arrow::util::UTF16StringToUTF8(filename_utf16),
+ error::UNICODE_CONVERSION_ERROR_ID);
+
+ return std::make_shared<FeatherWriter>(filename_utf8);
+ }
+
+ void FeatherWriter::getFilename(libmexclass::proxy::method::Context&
context) {
+ namespace mda = ::matlab::data;
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename,
+
arrow::util::UTF8StringToUTF16(filename),
+ context,
+
error::UNICODE_CONVERSION_ERROR_ID);
+ mda::ArrayFactory factory;
+ auto str_mda = factory.createScalar(utf16_filename);
+ context.outputs[0] = str_mda;
+ }
+
+ void FeatherWriter::write(libmexclass::proxy::method::Context& context) {
+ namespace mda = ::matlab::data;
+ mda::StructArray opts = context.inputs[0];
+ const mda::TypedArray<uint64_t> record_batch_proxy_id_mda =
opts[0]["RecordBatchProxyID"];
+ const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0];
+
+ auto proxy =
libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id);
+ auto record_batch_proxy =
std::static_pointer_cast<arrow::matlab::tabular::proxy::RecordBatch>(proxy);
+ auto record_batch = record_batch_proxy->unwrap();
+
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table,
+
arrow::Table::FromRecordBatches({record_batch}),
+ context,
+ error::TABLE_FROM_RECORD_BATCH);
+
+
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr<arrow::io::OutputStream>
output_stream,
+
arrow::io::FileOutputStream::Open(filename),
+ context,
+
error::FAILED_TO_OPEN_FILE_FOR_WRITE);
+
+ // Specify the feather file format version as V1
+ arrow::ipc::feather::WriteProperties write_props;
+ write_props.version = arrow::ipc::feather::kFeatherV1Version;
+
+ MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(ipc::feather::WriteTable(*table,
output_stream.get(), write_props),
+ context,
+
error::FEATHER_FAILED_TO_WRITE_TABLE);
+ }
+}
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
similarity index 59%
copy from matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
copy to matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
index b5d741060a..dadb479887 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
@@ -17,27 +17,25 @@
#pragma once
-#include "arrow/record_batch.h"
+#include "arrow/status.h"
#include "libmexclass/proxy/Proxy.h"
-namespace arrow::matlab::tabular::proxy {
+namespace arrow::matlab::io::feather::proxy {
- class RecordBatch : public libmexclass::proxy::Proxy {
+ class FeatherWriter : public libmexclass::proxy::Proxy {
public:
- RecordBatch(std::shared_ptr<arrow::RecordBatch> record_batch);
+ FeatherWriter(const std::string& filename);
- virtual ~RecordBatch() {}
+ ~FeatherWriter() {}
- static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
-
+ static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
+
protected:
- void toString(libmexclass::proxy::method::Context& context);
- void numColumns(libmexclass::proxy::method::Context& context);
- void columnNames(libmexclass::proxy::method::Context& context);
- void getColumnByIndex(libmexclass::proxy::method::Context&
context);
+ void getFilename(libmexclass::proxy::method::Context& context);
+ void write(libmexclass::proxy::method::Context& context);
- std::shared_ptr<arrow::RecordBatch> record_batch;
+ private:
+ const std::string filename;
};
-
}
diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
index 7d18c6c6b6..7a2a4f3192 100644
--- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
+++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
@@ -25,6 +25,7 @@
#include "arrow/matlab/type/proxy/string_type.h"
#include "arrow/matlab/type/proxy/timestamp_type.h"
#include "arrow/matlab/type/proxy/field.h"
+#include "arrow/matlab/io/feather/proxy/feather_writer.h"
#include "factory.h"
@@ -60,6 +61,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const
ClassName& class_name,
REGISTER_PROXY(arrow.type.proxy.BooleanType ,
arrow::matlab::type::proxy::PrimitiveCType<bool>);
REGISTER_PROXY(arrow.type.proxy.StringType ,
arrow::matlab::type::proxy::StringType);
REGISTER_PROXY(arrow.type.proxy.TimestampType ,
arrow::matlab::type::proxy::TimestampType);
+ REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter ,
arrow::matlab::io::feather::proxy::FeatherWriter);
return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not
find matching C++ proxy for " + class_name};
};
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
index ed30472f6c..e159e926ec 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
@@ -56,6 +56,10 @@ namespace arrow::matlab::tabular::proxy {
REGISTER_METHOD(RecordBatch, getColumnByIndex);
}
+ std::shared_ptr<arrow::RecordBatch> RecordBatch::unwrap() {
+ return record_batch;
+ }
+
void RecordBatch::toString(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string,
arrow::util::UTF8StringToUTF16(record_batch->ToString()), context,
error::UNICODE_CONVERSION_ERROR_ID);
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
index b5d741060a..b8c038816b 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
@@ -29,6 +29,8 @@ namespace arrow::matlab::tabular::proxy {
virtual ~RecordBatch() {}
+ std::shared_ptr<arrow::RecordBatch> unwrap();
+
static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
protected:
diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
new file mode 100644
index 0000000000..470c41fd5b
--- /dev/null
+++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
@@ -0,0 +1,48 @@
+%WRITER Class for writing feather V1 files.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements. See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License. You may obtain a copy of the License at
+%
+% http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied. See the License for the specific language governing
+% permissions and limitations under the License.
+classdef Writer < matlab.mixin.Scalar
+
+ properties(Hidden, SetAccess=private, GetAccess=public)
+ Proxy
+ end
+
+ properties(Dependent)
+ Filename
+ end
+
+ methods
+ function obj = Writer(filename)
+ arguments
+ filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText}
+ end
+
+ args = struct(Filename=filename);
+ proxyName = "arrow.io.feather.proxy.FeatherWriter";
+ obj.Proxy = arrow.internal.proxy.create(proxyName, args);
+ end
+
+ function write(obj, T)
+ rb = arrow.recordbatch(T);
+ args = struct(RecordBatchProxyID=rb.Proxy.ID);
+ obj.Proxy.write(args);
+ end
+
+ function filename = get.Filename(obj)
+ filename = obj.Proxy.getFilename();
+ end
+ end
+end
\ No newline at end of file
diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
index 0d002797f0..be5eee7d89 100644
--- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
+++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
@@ -23,7 +23,7 @@ classdef RecordBatch < matlab.mixin.CustomDisplay & ...
ColumnNames
end
- properties (Access=protected)
+ properties (Hidden, SetAccess=private, GetAccess=public)
Proxy
end
diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m
b/matlab/test/arrow/io/feather/tRoundTrip.m
new file mode 100644
index 0000000000..d56152be6d
--- /dev/null
+++ b/matlab/test/arrow/io/feather/tRoundTrip.m
@@ -0,0 +1,52 @@
+%TROUNDTRIP Round trip tests for feather.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements. See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License. You may obtain a copy of the License at
+%
+% http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied. See the License for the specific language governing
+% permissions and limitations under the License.
+classdef tRoundTrip < matlab.unittest.TestCase
+
+ methods(TestClassSetup)
+ % Delete once arrow.internal.io.feather.Reader is submitted.
+ function addFeatherFunctionsToMATLABPath(testCase)
+ import matlab.unittest.fixtures.PathFixture
+ % Add Feather test utilities to the MATLAB path.
+ testCase.applyFixture(PathFixture('../../../util'));
+ % arrow.cpp.call must be on the MATLAB path.
+ testCase.assertTrue(~isempty(which('arrow.cpp.call')), ...
+ '''arrow.cpp.call'' must be on the MATLAB path. Use
''addpath'' to add folders to the MATLAB path.');
+ end
+ end
+
+ methods(Test)
+ function Basic(testCase)
+ import matlab.unittest.fixtures.TemporaryFolderFixture
+
+ fixture = testCase.applyFixture(TemporaryFolderFixture);
+ filename = fullfile(fixture.Folder, "temp.feather");
+
+ DoubleVar = [10; 20; 30; 40];
+ SingleVar = single([10; 15; 20; 25]);
+ tWrite = table(DoubleVar, SingleVar);
+
+ featherwrite(tWrite, filename);
+ tRead = featherread(filename);
+ testCase.verifyEqual(tWrite, tRead);
+ end
+ end
+end
+
+function featherwrite(T, filename)
+ writer = arrow.internal.io.feather.Writer(filename);
+ writer.write(T);
+end
\ No newline at end of file
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index f4696cfad2..1d57999417 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -55,7 +55,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES
"${CMAKE_SOURCE_DIR}/src/cpp/a
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
-
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc")
+
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc"
+
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc")
+
set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")