This is an automated email from the ASF dual-hosted git repository.
kevingurney pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 152be67100 GH-37041: [MATLAB] Implement Feather V1 Reader using new
MATLAB Interface APIs (#37044)
152be67100 is described below
commit 152be67100cdd367a3e6064988085e3b327ad0fe
Author: Kevin Gurney <[email protected]>
AuthorDate: Mon Aug 7 16:26:16 2023 -0400
GH-37041: [MATLAB] Implement Feather V1 Reader using new MATLAB Interface
APIs (#37044)
### Rationale for this change
Now that we have the basic building blocks for tabular IO in the MATLAB
Interface (Array, Schema, RecordBatch), we can implement a Feather V1 reader in
terms of the new APIs.
This is a follow-up to #37043, where a new Feather V1 internal `Writer`
object was added.
### What changes are included in this PR?
1. Added a new class called arrow.internal.io.feather.Reader which can be
used to read Feather V1 files. It has one public property named `Filename` and
one public method named `read`.
**Example Usage:**
```matlab
>> T = array2table(rand(3))
T =
3x3 table
Var1 Var2 Var3
_______ ________ _______
0.79221 0.035712 0.67874
0.95949 0.84913 0.75774
0.65574 0.93399 0.74313
>> filename = "test.feather";
>> featherwrite(filename, T)
>> reader = arrow.internal.io.feather.Reader(filename)
reader =
Reader with properties:
Filename: "test.feather"
>> T = reader.read()
T =
3x3 table
Var1 Var2 Var3
_______ ________ _______
0.79221 0.035712 0.67874
0.95949 0.84913 0.75774
0.65574 0.93399 0.74313
```
### Are these changes tested?
Yes.
1. Added `Reader` to `feather/tRoundTrip.m`.
### Are there any user-facing changes?
No.
These are only internal objects right now.
### Future Directions
1. Re-implement `featherread` in terms of the new `Reader` object.
2. Remove legacy feather code and infrastructure.
### Notes
1. For conciseness, I renamed the C++ Proxy class `FeatherWriter` to
`Writer` since it is already inside of a `feather` namespace / "package".
* Closes: #37041
Authored-by: Kevin Gurney <[email protected]>
Signed-off-by: Kevin Gurney <[email protected]>
---
matlab/src/cpp/arrow/matlab/error/error.h | 6 ++
.../cpp/arrow/matlab/io/feather/proxy/reader.cc | 98 ++++++++++++++++++++++
.../feather/proxy/{feather_writer.h => reader.h} | 20 ++---
.../feather/proxy/{feather_writer.cc => writer.cc} | 16 ++--
.../feather/proxy/{feather_writer.h => writer.h} | 6 +-
matlab/src/cpp/arrow/matlab/proxy/factory.cc | 6 +-
.../+internal/+io/+feather/{Writer.m => Reader.m} | 30 ++++---
.../matlab/+arrow/+internal/+io/+feather/Writer.m | 4 +-
matlab/test/arrow/io/feather/tRoundTrip.m | 5 ++
matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 4 +-
10 files changed, 154 insertions(+), 41 deletions(-)
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h
b/matlab/src/cpp/arrow/matlab/error/error.h
index e1d2982f28..deac5e26fc 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -181,7 +181,13 @@ namespace arrow::matlab::error {
static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH =
"arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch";
static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX =
"arrow:tabular:recordbatch:InvalidNumericColumnIndex";
static const char* FAILED_TO_OPEN_FILE_FOR_WRITE =
"arrow:io:FailedToOpenFileForWrite";
+ static const char* FAILED_TO_OPEN_FILE_FOR_READ =
"arrow:io:FailedToOpenFileForRead";
static const char* FEATHER_FAILED_TO_WRITE_TABLE =
"arrow:io:feather:FailedToWriteTable";
static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch";
+ static const char* FEATHER_FAILED_TO_CREATE_READER =
"arrow:io:feather:FailedToCreateReader";
+ static const char* FEATHER_VERSION_2 = "arrow:io:feather:FeatherVersion2";
+ static const char* FEATHER_VERSION_UNKNOWN =
"arrow:io:feather:FeatherVersionUnknown";
+ static const char* FEATHER_FAILED_TO_READ_TABLE =
"arrow:io:feather:FailedToReadTable";
+ static const char* FEATHER_FAILED_TO_READ_RECORD_BATCH =
"arrow:io:feather:FailedToReadRecordBatch";
}
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
new file mode 100644
index 0000000000..a264d24ecb
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
@@ -0,0 +1,98 @@
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "libmexclass/proxy/ProxyManager.h"
+
+#include "arrow/matlab/error/error.h"
+#include "arrow/matlab/io/feather/proxy/reader.h"
+#include "arrow/matlab/tabular/proxy/record_batch.h"
+
+#include "arrow/util/utf8.h"
+
+#include "arrow/result.h"
+
+#include "arrow/io/file.h"
+#include "arrow/ipc/feather.h"
+#include "arrow/table.h"
+
+namespace arrow::matlab::io::feather::proxy {
+
+ Reader::Reader(const std::string& filename) : filename{filename} {
+ REGISTER_METHOD(Reader, read);
+ REGISTER_METHOD(Reader, getFilename);
+ }
+
+ libmexclass::proxy::MakeResult Reader::make(const
libmexclass::proxy::FunctionArguments& constructor_arguments) {
+ namespace mda = ::matlab::data;
+ using ReaderProxy = arrow::matlab::io::feather::proxy::Reader;
+
+ mda::StructArray args = constructor_arguments[0];
+ const mda::StringArray filename_utf16_mda = args[0]["Filename"];
+ const auto filename_utf16 = std::u16string(filename_utf16_mda[0]);
+ MATLAB_ASSIGN_OR_ERROR(const auto filename,
arrow::util::UTF16StringToUTF8(filename_utf16),
error::UNICODE_CONVERSION_ERROR_ID);
+
+ return std::make_shared<ReaderProxy>(filename);
+ }
+
+ void Reader::read(libmexclass::proxy::method::Context& context) {
+ namespace mda = ::matlab::data;
+ using namespace libmexclass::proxy;
+ using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;
+
+ mda::ArrayFactory factory;
+
+ // Create a file input stream.
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source,
arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context,
error::FAILED_TO_OPEN_FILE_FOR_READ);
+
+ // Create a Reader from the file input stream.
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto reader,
arrow::ipc::feather::Reader::Open(source), context,
error::FEATHER_FAILED_TO_CREATE_READER);
+
+ // Error if not Feather V1.
+ const auto version = reader->version();
+ if (version == ipc::feather::kFeatherV2Version) {
+
MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::NotImplemented("Support for Feather
V2 has not been implemented."), context, error::FEATHER_VERSION_2);
+ } else if (version != ipc::feather::kFeatherV1Version) {
+ MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::Invalid("Unknown
Feather format version."), context, error::FEATHER_VERSION_UNKNOWN);
+ }
+
+ // Read a Table from the file.
+ std::shared_ptr<arrow::Table> table = nullptr;
+ MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(reader->Read(&table), context,
error::FEATHER_FAILED_TO_READ_TABLE);
+
+ // Get the first RecordBatch from the Table.
+ arrow::TableBatchReader table_batch_reader{table};
+ std::shared_ptr<arrow::RecordBatch> record_batch = nullptr;
+
MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(table_batch_reader.ReadNext(&record_batch),
context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH);
+
+ // Create a Proxy from the first RecordBatch.
+ auto record_batch_proxy =
std::make_shared<RecordBatchProxy>(record_batch);
+ const auto record_batch_proxy_id =
ProxyManager::manageProxy(record_batch_proxy);
+
+ const auto record_batch_proxy_id_mda =
factory.createScalar(record_batch_proxy_id);
+
+ context.outputs[0] = record_batch_proxy_id_mda;
+ }
+
+ void Reader::getFilename(libmexclass::proxy::method::Context& context) {
+ namespace mda = ::matlab::data;
+ mda::ArrayFactory factory;
+
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto filename_utf16,
arrow::util::UTF8StringToUTF16(filename), context,
error::UNICODE_CONVERSION_ERROR_ID);
+ auto filename_utf16_mda = factory.createScalar(filename_utf16);
+ context.outputs[0] = filename_utf16_mda;
+ }
+
+}
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h
similarity index 73%
copy from matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
copy to matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h
index dadb479887..fb6c06de86 100644
--- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h
@@ -17,25 +17,23 @@
#pragma once
-#include "arrow/status.h"
-
#include "libmexclass/proxy/Proxy.h"
namespace arrow::matlab::io::feather::proxy {
- class FeatherWriter : public libmexclass::proxy::Proxy {
+ class Reader : public libmexclass::proxy::Proxy {
public:
- FeatherWriter(const std::string& filename);
-
- ~FeatherWriter() {}
+ Reader(const std::string& filename);
+
+ virtual ~Reader() {}
+
+ static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
- static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
-
protected:
+ void read(libmexclass::proxy::method::Context& context);
void getFilename(libmexclass::proxy::method::Context& context);
- void write(libmexclass::proxy::method::Context& context);
- private:
- const std::string filename;
+ const std::string filename;
};
+
}
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc
similarity index 86%
rename from matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
rename to matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc
index a27e1fb0e6..c71c9ae7a5 100644
--- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-#include "arrow/matlab/io/feather/proxy/feather_writer.h"
+#include "arrow/matlab/io/feather/proxy/writer.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/error/error.h"
@@ -30,12 +30,12 @@
namespace arrow::matlab::io::feather::proxy {
- FeatherWriter::FeatherWriter(const std::string& filename) :
filename{filename} {
- REGISTER_METHOD(FeatherWriter, getFilename);
- REGISTER_METHOD(FeatherWriter, write);
+ Writer::Writer(const std::string& filename) : filename{filename} {
+ REGISTER_METHOD(Writer, getFilename);
+ REGISTER_METHOD(Writer, write);
}
- libmexclass::proxy::MakeResult FeatherWriter::make(const
libmexclass::proxy::FunctionArguments& constructor_arguments) {
+ libmexclass::proxy::MakeResult Writer::make(const
libmexclass::proxy::FunctionArguments& constructor_arguments) {
namespace mda = ::matlab::data;
mda::StructArray opts = constructor_arguments[0];
const mda::StringArray filename_mda = opts[0]["Filename"];
@@ -45,10 +45,10 @@ namespace arrow::matlab::io::feather::proxy {
arrow::util::UTF16StringToUTF8(filename_utf16),
error::UNICODE_CONVERSION_ERROR_ID);
- return std::make_shared<FeatherWriter>(filename_utf8);
+ return std::make_shared<Writer>(filename_utf8);
}
- void FeatherWriter::getFilename(libmexclass::proxy::method::Context&
context) {
+ void Writer::getFilename(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename,
arrow::util::UTF8StringToUTF16(filename),
@@ -59,7 +59,7 @@ namespace arrow::matlab::io::feather::proxy {
context.outputs[0] = str_mda;
}
- void FeatherWriter::write(libmexclass::proxy::method::Context& context) {
+ void Writer::write(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
mda::StructArray opts = context.inputs[0];
const mda::TypedArray<uint64_t> record_batch_proxy_id_mda =
opts[0]["RecordBatchProxyID"];
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h
similarity index 89%
rename from matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
rename to matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h
index dadb479887..21dc70f432 100644
--- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h
@@ -23,11 +23,11 @@
namespace arrow::matlab::io::feather::proxy {
- class FeatherWriter : public libmexclass::proxy::Proxy {
+ class Writer : public libmexclass::proxy::Proxy {
public:
- FeatherWriter(const std::string& filename);
+ Writer(const std::string& filename);
- ~FeatherWriter() {}
+ ~Writer() {}
static libmexclass::proxy::MakeResult make(const
libmexclass::proxy::FunctionArguments& constructor_arguments);
diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
index 7a2a4f3192..bce875bb9f 100644
--- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
+++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
@@ -25,7 +25,8 @@
#include "arrow/matlab/type/proxy/string_type.h"
#include "arrow/matlab/type/proxy/timestamp_type.h"
#include "arrow/matlab/type/proxy/field.h"
-#include "arrow/matlab/io/feather/proxy/feather_writer.h"
+#include "arrow/matlab/io/feather/proxy/writer.h"
+#include "arrow/matlab/io/feather/proxy/reader.h"
#include "factory.h"
@@ -61,7 +62,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const
ClassName& class_name,
REGISTER_PROXY(arrow.type.proxy.BooleanType ,
arrow::matlab::type::proxy::PrimitiveCType<bool>);
REGISTER_PROXY(arrow.type.proxy.StringType ,
arrow::matlab::type::proxy::StringType);
REGISTER_PROXY(arrow.type.proxy.TimestampType ,
arrow::matlab::type::proxy::TimestampType);
- REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter ,
arrow::matlab::io::feather::proxy::FeatherWriter);
+ REGISTER_PROXY(arrow.io.feather.proxy.Writer ,
arrow::matlab::io::feather::proxy::Writer);
+ REGISTER_PROXY(arrow.io.feather.proxy.Reader ,
arrow::matlab::io::feather::proxy::Reader);
return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not
find matching C++ proxy for " + class_name};
};
diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m
similarity index 63%
copy from matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
copy to matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m
index 470c41fd5b..80da7294d2 100644
--- a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
+++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m
@@ -1,5 +1,3 @@
-%WRITER Class for writing feather V1 files.
-
% Licensed to the Apache Software Foundation (ASF) under one or more
% contributor license agreements. See the NOTICE file distributed with
% this work for additional information regarding copyright ownership.
@@ -14,35 +12,41 @@
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
% implied. See the License for the specific language governing
% permissions and limitations under the License.
-classdef Writer < matlab.mixin.Scalar
- properties(Hidden, SetAccess=private, GetAccess=public)
+classdef Reader
+%READER An internal Reader object for reading Feather files.
+
+ properties (GetAccess=public, SetAccess=private, Hidden)
Proxy
end
- properties(Dependent)
+ properties (Dependent, SetAccess=private, GetAccess=public)
+ % Name of the file to read.
Filename
end
methods
- function obj = Writer(filename)
+
+ function obj = Reader(filename)
arguments
filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText}
end
args = struct(Filename=filename);
- proxyName = "arrow.io.feather.proxy.FeatherWriter";
- obj.Proxy = arrow.internal.proxy.create(proxyName, args);
+ obj.Proxy =
arrow.internal.proxy.create("arrow.io.feather.proxy.Reader", args);
end
- function write(obj, T)
- rb = arrow.recordbatch(T);
- args = struct(RecordBatchProxyID=rb.Proxy.ID);
- obj.Proxy.write(args);
+ function T = read(obj)
+ recordBatchProxyID = obj.Proxy.read();
+ proxy =
libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.RecordBatch",
ID=recordBatchProxyID);
+ recordBatch = arrow.tabular.RecordBatch(proxy);
+ T = recordBatch.toMATLAB();
end
function filename = get.Filename(obj)
filename = obj.Proxy.getFilename();
end
+
end
-end
\ No newline at end of file
+
+end
diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
index 470c41fd5b..37c785f10a 100644
--- a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
+++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
@@ -31,7 +31,7 @@ classdef Writer < matlab.mixin.Scalar
end
args = struct(Filename=filename);
- proxyName = "arrow.io.feather.proxy.FeatherWriter";
+ proxyName = "arrow.io.feather.proxy.Writer";
obj.Proxy = arrow.internal.proxy.create(proxyName, args);
end
@@ -45,4 +45,4 @@ classdef Writer < matlab.mixin.Scalar
filename = obj.Proxy.getFilename();
end
end
-end
\ No newline at end of file
+end
diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m
b/matlab/test/arrow/io/feather/tRoundTrip.m
index d56152be6d..e735d196c1 100644
--- a/matlab/test/arrow/io/feather/tRoundTrip.m
+++ b/matlab/test/arrow/io/feather/tRoundTrip.m
@@ -49,4 +49,9 @@ end
function featherwrite(T, filename)
writer = arrow.internal.io.feather.Writer(filename);
writer.write(T);
+end
+
+function T = featherread(filename)
+ reader = arrow.internal.io.feather.Reader(filename);
+ T = reader.read();
end
\ No newline at end of file
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index 1d57999417..c19740f181 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -56,8 +56,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES
"${CMAKE_SOURCE_DIR}/src/cpp/a
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc"
-
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc")
-
+
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/writer.cc"
+
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc")
set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")