This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c12c55  ARROW-14088: [GLib][Ruby][Dataset] Add support for filter
5c12c55 is described below

commit 5c12c55b103d41a85b0b8a0a8993a87fc83e42e7
Author: Sutou Kouhei <[email protected]>
AuthorDate: Mon Oct 18 05:08:54 2021 +0900

    ARROW-14088: [GLib][Ruby][Dataset] Add support for filter
    
    Closes #11442 from kou/glib-dataset-filter
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 c_glib/arrow-dataset-glib/scanner.cpp              | 41 +++++++++++++++++-----
 c_glib/arrow-dataset-glib/scanner.h                |  7 ++--
 c_glib/arrow-glib/compute.cpp                      |  2 +-
 c_glib/test/dataset/test-file-system-dataset.rb    |  2 +-
 .../{test-scanner.rb => test-scanner-builder.rb}   | 34 ++++++++++++++----
 c_glib/test/dataset/test-scanner.rb                |  7 ----
 .../lib/arrow-dataset/arrow-table-loadable.rb      | 11 +++++-
 ruby/red-arrow-dataset/test/test-arrow-table.rb    |  9 +++++
 8 files changed, 85 insertions(+), 28 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index eefd6c7..51542bb 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -18,6 +18,7 @@
  */
 
 #include <arrow-glib/error.hpp>
+#include <arrow-glib/expression.hpp>
 #include <arrow-glib/reader.hpp>
 #include <arrow-glib/table.hpp>
 
@@ -138,6 +139,7 @@ typedef struct GADatasetScannerBuilderPrivate_ {
 
 enum {
   PROP_SCANNER_BUILDER = 1,
+  PROP_USE_ASYNC,
 };
 
 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder,
@@ -171,6 +173,11 @@ gadataset_scanner_builder_set_property(GObject *object,
       *static_cast<std::shared_ptr<arrow::dataset::ScannerBuilder> *>(
         g_value_get_pointer(value));
     break;
+  case PROP_USE_ASYNC:
+    garrow::check(nullptr,
+                  priv->scanner_builder->UseAsync(g_value_get_boolean(value)),
+                  "[scanner-builder][use-async][set]");
+    break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -199,6 +206,21 @@ gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass)
                               static_cast<GParamFlags>(G_PARAM_WRITABLE |
                                                        G_PARAM_CONSTRUCT_ONLY));
   g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec);
+
+  arrow::dataset::ScanOptions default_options;
+  /**
+   * GADatasetScannerBuilder:use-async:
+   *
+   * Whether or not async mode is used.
+   *
+   * Since: 6.0.0
+   */
+  spec = g_param_spec_boolean("use-async",
+                              "Use async",
+                              "Whether or not async mode is used",
+                              default_options.use_async,
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE));
+  g_object_class_install_property(gobject_class, PROP_USE_ASYNC, spec);
 }
 
 /**
@@ -245,22 +267,25 @@ gadataset_scanner_builder_new_record_batch_reader(
 }
 
 /**
- * gadataset_scanner_builder_use_async:
+ * gadataset_scanner_builder_set_filter:
  * @builder: A #GADatasetScannerBuilder.
- * @use_async: Use the asynchronous scanner
+ * @expression: A #GArrowExpression to filter rows with.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
- * Returns: void
+ * Returns: %TRUE on success, %FALSE on error.
  *
  * Since: 6.0.0
  */
-void
-gadataset_scanner_builder_use_async(GADatasetScannerBuilder *builder, gboolean use_async,
-                                 GError **error)
+gboolean
+gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder,
+                                     GArrowExpression *expression,
+                                     GError **error)
 {
   auto arrow_builder = gadataset_scanner_builder_get_raw(builder);
-  auto use_async_result = arrow_builder->UseAsync(use_async);
-  garrow::check(error, use_async_result, "[scanner-builder][use_async]");
+  auto arrow_expression = garrow_expression_get_raw(expression);
+  return garrow::check(error,
+                       arrow_builder->Filter(*arrow_expression),
+                       "[scanner-builder][filter][set]");
 }
 
 /**
diff --git a/c_glib/arrow-dataset-glib/scanner.h 
b/c_glib/arrow-dataset-glib/scanner.h
index 4afec00..59da257 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -61,9 +61,10 @@ gadataset_scanner_builder_new_record_batch_reader(
   GArrowRecordBatchReader *reader);
 
 GARROW_AVAILABLE_IN_6_0
-void
-gadataset_scanner_builder_use_async(
-  GADatasetScannerBuilder *builder, gboolean use_async, GError **error);       
                            
+gboolean
+gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder,
+                                     GArrowExpression *expression,
+                                     GError **error);
 
 GARROW_AVAILABLE_IN_5_0
 GADatasetScanner *
diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 2f4a0de..cdfc96a 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -151,7 +151,7 @@ G_BEGIN_DECLS
  * #GArrowScalarAggregateOptions is a class to customize the scalar
  * aggregate functions such as `count` function and convenient
  * functions of them such as garrow_array_count().
-
+ *
  * #GArrowCountOptions is a class to customize the `count` function and
  * garrow_array_count() family.
  *
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index 0e91d75..1aef38f 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -60,7 +60,7 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
                         count: build_int32_array([1, 10, 2, 3]))
     table_reader = Arrow::TableBatchReader.new(table)
     scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader)
-    scanner_builder.use_async(true)
+    scanner_builder.use_async = true
     scanner = scanner_builder.finish
     options = ArrowDataset::FileSystemDatasetWriteOptions.new
     options.file_write_options = @format.default_write_options
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner-builder.rb
similarity index 65%
copy from c_glib/test/dataset/test-scanner.rb
copy to c_glib/test/dataset/test-scanner-builder.rb
index ed6a706..5674db4 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner-builder.rb
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-class TestDatasetScanner < Test::Unit::TestCase
+class TestDatasetScannerBuilder < Test::Unit::TestCase
   include Helper::Buildable
   include Helper::Writable
 
@@ -36,20 +36,40 @@ class TestDatasetScanner < Test::Unit::TestCase
       factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
       factory.file_system_uri = build_file_uri(path)
       @dataset = factory.finish
-      builder = @dataset.begin_scan
-      @scanner = builder.finish
+      @builder = @dataset.begin_scan
       yield
     end
   end
 
-  def test_to_table
-    assert_equal(@table, @scanner.to_table)
-  end
-
   def test_new_record_batch_reader
     reader = Arrow::TableBatchReader.new(@table)
     builder = ArrowDataset::ScannerBuilder.new(reader)
     scanner = builder.finish
     assert_equal(@table, scanner.to_table)
   end
+
+  def test_filter
+    visible = Arrow::FieldExpression.new("visible")
+    true_scalar = Arrow::BooleanScalar.new(true)
+    true_datum = Arrow::ScalarDatum.new(true_scalar)
+    true_literal = Arrow::LiteralExpression.new(true_datum)
+    filter = Arrow::CallExpression.new("equal", [visible, true_literal])
+    @builder.filter = filter
+    scanner = @builder.finish
+    assert_equal(build_table(visible: [
+                               build_boolean_array([true, true]),
+                               build_boolean_array([true, true]),
+                             ],
+                             point: [
+                               build_int32_array([1, 3]),
+                               build_int32_array([-2, -4]),
+                             ]),
+                 scanner.to_table)
+  end
+
+  def test_use_async
+    @builder.use_async = true
+    scanner = @builder.finish
+    assert_equal(@table, scanner.to_table)
+  end
 end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index ed6a706..f7702d4 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,11 +45,4 @@ class TestDatasetScanner < Test::Unit::TestCase
   def test_to_table
     assert_equal(@table, @scanner.to_table)
   end
-
-  def test_new_record_batch_reader
-    reader = Arrow::TableBatchReader.new(@table)
-    builder = ArrowDataset::ScannerBuilder.new(reader)
-    scanner = builder.finish
-    assert_equal(@table, scanner.to_table)
-  end
 end
diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
index bda8d86..14c8dce 100644
--- a/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
@@ -40,7 +40,16 @@ module ArrowDataset
       dataset = FileSystemDataset.build(format) do |factory|
         factory.file_system_uri = uri
       end
-      dataset.to_table
+      scanner_builder = dataset.begin_scan
+      @options.each do |key, value|
+        next if key == :format
+        next if value.nil?
+        setter = "#{key}="
+        next unless scanner_builder.respond_to?(setter)
+        scanner_builder.public_send(setter, value)
+      end
+      scanner = scanner_builder.finish
+      scanner.to_table
     end
   end
 end
diff --git a/ruby/red-arrow-dataset/test/test-arrow-table.rb b/ruby/red-arrow-dataset/test/test-arrow-table.rb
index a7bbf42..1913063 100644
--- a/ruby/red-arrow-dataset/test/test-arrow-table.rb
+++ b/ruby/red-arrow-dataset/test/test-arrow-table.rb
@@ -67,5 +67,14 @@ class TestArrowTable < Test::Unit::TestCase
       assert_equal(@table1.concatenate([@table2]),
                    Arrow::Table.load(@dir))
     end
+
+    def test_filter
+      @table1.save(build_file_uri(@path1))
+      @table2.save(build_file_uri(@path2))
+      assert_equal(Arrow::Table.new(visible: [true, true, true],
+                                    point: [1, 3, 10]),
+                   Arrow::Table.load(@dir,
+                                     filter: ["equal", :visible, true]))
+    end
   end
 end

Reply via email to