This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1c9e393b73 GH-41749: [GLib] Allow getting a RecordBatchReader from a Dataset or Scanner (#41750)
1c9e393b73 is described below

commit 1c9e393b73195840960dfb9eca8c0dc390be751a
Author: Adam Reeve <[email protected]>
AuthorDate: Sun May 26 09:43:52 2024 +1200

    GH-41749: [GLib] Allow getting a RecordBatchReader from a Dataset or Scanner (#41750)
    
    ### Rationale for this change
    
    See #41749
    
    ### What changes are included in this PR?
    
    Adds `to_reader` methods to `GADatasetDataset` and `GADatasetScanner`.
    
    ### Are these changes tested?
    
    Yes, I've added new unit tests.
    
    ### Are there any user-facing changes?
    
    Yes, this is a new feature.
    * GitHub Issue: #41749
    
    Lead-authored-by: Adam Reeve <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 c_glib/arrow-dataset-glib/dataset.cpp           | 37 ++++++++++++++++++++++++-
 c_glib/arrow-dataset-glib/dataset.h             |  3 ++
 c_glib/arrow-dataset-glib/scanner.cpp           | 22 +++++++++++++++
 c_glib/arrow-dataset-glib/scanner.h             |  4 +++
 c_glib/test/dataset/test-file-system-dataset.rb | 24 ++++++++++++++--
 c_glib/test/dataset/test-scanner.rb             | 10 +++++++
 6 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/dataset.cpp 
b/c_glib/arrow-dataset-glib/dataset.cpp
index 704d6b589e..f84e4e3db3 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -19,6 +19,7 @@
 
 #include <arrow-glib/error.hpp>
 #include <arrow-glib/file-system.hpp>
+#include <arrow-glib/reader.hpp>
 #include <arrow-glib/table.hpp>
 
 #include <arrow-dataset-glib/dataset-factory.hpp>
@@ -152,12 +153,46 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, 
GError **error)
   }
   auto arrow_scanner = *arrow_scanner_result;
   auto arrow_table_result = arrow_scanner->ToTable();
-  if (!garrow::check(error, arrow_scanner_result, "[dataset][to-table]")) {
+  if (!garrow::check(error, arrow_table_result, "[dataset][to-table]")) {
     return NULL;
   }
   return garrow_table_new_raw(&(*arrow_table_result));
 }
 
+/**
+ * gadataset_dataset_to_record_batch_reader:
+ * @dataset: A #GADatasetDataset.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ *   A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError 
**error)
+{
+  auto arrow_dataset = gadataset_dataset_get_raw(dataset);
+  auto arrow_scanner_builder_result = arrow_dataset->NewScan();
+  if (!garrow::check(error,
+                     arrow_scanner_builder_result,
+                     "[dataset][to-record-batch-reader]")) {
+    return nullptr;
+  }
+  auto arrow_scanner_builder = *arrow_scanner_builder_result;
+  auto arrow_scanner_result = arrow_scanner_builder->Finish();
+  if (!garrow::check(error, arrow_scanner_result, 
"[dataset][to-record-batch-reader]")) {
+    return nullptr;
+  }
+  auto arrow_scanner = *arrow_scanner_result;
+  auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+  if (!garrow::check(error, arrow_reader_result, 
"[dataset][to-record-batch-reader]")) {
+    return nullptr;
+  }
+  auto sources = g_list_prepend(nullptr, dataset);
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
+}
+
 /**
  * gadataset_dataset_get_type_name:
  * @dataset: A #GADatasetDataset.
diff --git a/c_glib/arrow-dataset-glib/dataset.h 
b/c_glib/arrow-dataset-glib/dataset.h
index 657de330e6..5b957f0538 100644
--- a/c_glib/arrow-dataset-glib/dataset.h
+++ b/c_glib/arrow-dataset-glib/dataset.h
@@ -34,6 +34,9 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError 
**error);
 GADATASET_AVAILABLE_IN_5_0
 gchar *
 gadataset_dataset_get_type_name(GADatasetDataset *dataset);
+GADATASET_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError 
**error);
 
 #define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS                       
          \
   (gadataset_file_system_dataset_write_options_get_type())
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp 
b/c_glib/arrow-dataset-glib/scanner.cpp
index 717532db92..28af1f16e5 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -128,6 +128,28 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, 
GError **error)
   }
 }
 
+/**
+ * gadataset_scanner_to_record_batch_reader:
+ * @scanner: A #GADatasetScanner.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ *   A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError 
**error)
+{
+  auto arrow_scanner = gadataset_scanner_get_raw(scanner);
+  auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+  if (!garrow::check(error, arrow_reader_result, 
"[scanner][to-record-batch-reader]")) {
+    return nullptr;
+  }
+  auto sources = g_list_prepend(nullptr, scanner);
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
+}
+
 typedef struct GADatasetScannerBuilderPrivate_
 {
   std::shared_ptr<arrow::dataset::ScannerBuilder> scanner_builder;
diff --git a/c_glib/arrow-dataset-glib/scanner.h 
b/c_glib/arrow-dataset-glib/scanner.h
index ad46239156..d92eca5ab8 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -37,6 +37,10 @@ GADATASET_AVAILABLE_IN_5_0
 GArrowTable *
 gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error);
 
+GADATASET_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError 
**error);
+
 #define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type())
 GADATASET_AVAILABLE_IN_5_0
 G_DECLARE_DERIVABLE_TYPE(
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb 
b/c_glib/test/dataset/test-file-system-dataset.rb
index 0e856b678f..96deedf6b4 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -56,6 +56,22 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
   end
 
   def test_read_write
+    dataset, expected_table = create_dataset
+    assert_equal(expected_table, dataset.to_table)
+  end
+
+  def test_to_record_batch_reader
+    dataset, expected_table = create_dataset
+    reader = dataset.to_record_batch_reader
+    begin
+      assert_equal(expected_table, reader.read_all)
+    ensure
+      # Unref to ensure the reader closes files and we can delete the temp 
directory
+      reader.unref
+    end
+  end
+
+  def create_dataset
     table = build_table(label: build_string_array(["a", "a", "b", "c"]),
                         count: build_int32_array([1, 10, 2, 3]))
     table_reader = Arrow::TableBatchReader.new(table)
@@ -73,7 +89,8 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
     end
     @factory.partition_base_dir = @dir
     dataset = @factory.finish
-    assert_equal(build_table(count: [
+
+    expected_table = build_table(count: [
                                build_int32_array([1, 10]),
                                build_int32_array([2]),
                                build_int32_array([3]),
@@ -82,7 +99,8 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
                                build_string_array(["a", "a"]),
                                build_string_array(["b"]),
                                build_string_array(["c"]),
-                             ]),
-                 dataset.to_table)
+                             ])
+
+    return dataset, expected_table
   end
 end
diff --git a/c_glib/test/dataset/test-scanner.rb 
b/c_glib/test/dataset/test-scanner.rb
index f7702d4905..5dc31eefc5 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,4 +45,14 @@ class TestDatasetScanner < Test::Unit::TestCase
   def test_to_table
     assert_equal(@table, @scanner.to_table)
   end
+
+  def test_to_record_batch_reader
+    reader = @scanner.to_record_batch_reader
+    begin
+      assert_equal(@table, reader.read_all)
+    ensure
+      # Unref to ensure the reader closes files and we can delete the temp 
directory
+      reader.unref
+    end
+  end
 end

Reply via email to