This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1c9e393b73 GH-41749: [GLib] Allow getting a RecordBatchReader from a
Dataset or Scanner (#41750)
1c9e393b73 is described below
commit 1c9e393b73195840960dfb9eca8c0dc390be751a
Author: Adam Reeve <[email protected]>
AuthorDate: Sun May 26 09:43:52 2024 +1200
GH-41749: [GLib] Allow getting a RecordBatchReader from a Dataset or
Scanner (#41750)
### Rationale for this change
See #41749
### What changes are included in this PR?
Adds `to_record_batch_reader` methods to `GADatasetDataset` and `GADatasetScanner`.
### Are these changes tested?
Yes, I've added new unit tests.
### Are there any user-facing changes?
Yes, this is a new feature.
* GitHub Issue: #41749
Lead-authored-by: Adam Reeve <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
c_glib/arrow-dataset-glib/dataset.cpp | 37 ++++++++++++++++++++++++-
c_glib/arrow-dataset-glib/dataset.h | 3 ++
c_glib/arrow-dataset-glib/scanner.cpp | 22 +++++++++++++++
c_glib/arrow-dataset-glib/scanner.h | 4 +++
c_glib/test/dataset/test-file-system-dataset.rb | 24 ++++++++++++++--
c_glib/test/dataset/test-scanner.rb | 10 +++++++
6 files changed, 96 insertions(+), 4 deletions(-)
diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp
index 704d6b589e..f84e4e3db3 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -19,6 +19,7 @@
#include <arrow-glib/error.hpp>
#include <arrow-glib/file-system.hpp>
+#include <arrow-glib/reader.hpp>
#include <arrow-glib/table.hpp>
#include <arrow-dataset-glib/dataset-factory.hpp>
@@ -152,12 +153,46 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error)
}
auto arrow_scanner = *arrow_scanner_result;
auto arrow_table_result = arrow_scanner->ToTable();
- if (!garrow::check(error, arrow_scanner_result, "[dataset][to-table]")) {
+ if (!garrow::check(error, arrow_table_result, "[dataset][to-table]")) {
return NULL;
}
return garrow_table_new_raw(&(*arrow_table_result));
}
+/**
+ * gadataset_dataset_to_record_batch_reader:
+ * @dataset: A #GADatasetDataset.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ * A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error)
+{
+ auto arrow_dataset = gadataset_dataset_get_raw(dataset);
+ auto arrow_scanner_builder_result = arrow_dataset->NewScan();
+ if (!garrow::check(error,
+ arrow_scanner_builder_result,
+ "[dataset][to-record-batch-reader]")) {
+ return nullptr;
+ }
+ auto arrow_scanner_builder = *arrow_scanner_builder_result;
+ auto arrow_scanner_result = arrow_scanner_builder->Finish();
+ if (!garrow::check(error, arrow_scanner_result, "[dataset][to-record-batch-reader]")) {
+ return nullptr;
+ }
+ auto arrow_scanner = *arrow_scanner_result;
+ auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+ if (!garrow::check(error, arrow_reader_result, "[dataset][to-record-batch-reader]")) {
+ return nullptr;
+ }
+ auto sources = g_list_prepend(nullptr, dataset);
+ return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
+}
+
/**
* gadataset_dataset_get_type_name:
* @dataset: A #GADatasetDataset.
diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h
index 657de330e6..5b957f0538 100644
--- a/c_glib/arrow-dataset-glib/dataset.h
+++ b/c_glib/arrow-dataset-glib/dataset.h
@@ -34,6 +34,9 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error);
GADATASET_AVAILABLE_IN_5_0
gchar *
gadataset_dataset_get_type_name(GADatasetDataset *dataset);
+GADATASET_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error);
#define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \
(gadataset_file_system_dataset_write_options_get_type())
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index 717532db92..28af1f16e5 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -128,6 +128,28 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error)
}
}
+/**
+ * gadataset_scanner_to_record_batch_reader:
+ * @scanner: A #GADatasetScanner.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ * A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error)
+{
+ auto arrow_scanner = gadataset_scanner_get_raw(scanner);
+ auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+ if (!garrow::check(error, arrow_reader_result, "[scanner][to-record-batch-reader]")) {
+ return nullptr;
+ }
+ auto sources = g_list_prepend(nullptr, scanner);
+ return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
+}
+
typedef struct GADatasetScannerBuilderPrivate_
{
std::shared_ptr<arrow::dataset::ScannerBuilder> scanner_builder;
diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h
index ad46239156..d92eca5ab8 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -37,6 +37,10 @@ GADATASET_AVAILABLE_IN_5_0
GArrowTable *
gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error);
+GADATASET_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error);
+
#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type())
GADATASET_AVAILABLE_IN_5_0
G_DECLARE_DERIVABLE_TYPE(
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index 0e856b678f..96deedf6b4 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -56,6 +56,22 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
end
def test_read_write
+ dataset, expected_table = create_dataset
+ assert_equal(expected_table, dataset.to_table)
+ end
+
+ def test_to_record_batch_reader
+ dataset, expected_table = create_dataset
+ reader = dataset.to_record_batch_reader
+ begin
+ assert_equal(expected_table, reader.read_all)
+ ensure
+ # Unref to ensure the reader closes files and we can delete the temp directory
+ reader.unref
+ end
+ end
+
+ def create_dataset
table = build_table(label: build_string_array(["a", "a", "b", "c"]),
count: build_int32_array([1, 10, 2, 3]))
table_reader = Arrow::TableBatchReader.new(table)
@@ -73,7 +89,8 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
end
@factory.partition_base_dir = @dir
dataset = @factory.finish
- assert_equal(build_table(count: [
+
+ expected_table = build_table(count: [
build_int32_array([1, 10]),
build_int32_array([2]),
build_int32_array([3]),
@@ -82,7 +99,8 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
build_string_array(["a", "a"]),
build_string_array(["b"]),
build_string_array(["c"]),
- ]),
- dataset.to_table)
+ ])
+
+ return dataset, expected_table
end
end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index f7702d4905..5dc31eefc5 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,4 +45,14 @@ class TestDatasetScanner < Test::Unit::TestCase
def test_to_table
assert_equal(@table, @scanner.to_table)
end
+
+ def test_to_record_batch_reader
+ reader = @scanner.to_record_batch_reader
+ begin
+ assert_equal(@table, reader.read_all)
+ ensure
+ # Unref to ensure the reader closes files and we can delete the temp directory
+ reader.unref
+ end
+ end
end