This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5c12c55 ARROW-14088: [GLib][Ruby][Dataset] Add support for filter
5c12c55 is described below
commit 5c12c55b103d41a85b0b8a0a8993a87fc83e42e7
Author: Sutou Kouhei <[email protected]>
AuthorDate: Mon Oct 18 05:08:54 2021 +0900
ARROW-14088: [GLib][Ruby][Dataset] Add support for filter
Closes #11442 from kou/glib-dataset-filter
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
c_glib/arrow-dataset-glib/scanner.cpp | 41 +++++++++++++++++-----
c_glib/arrow-dataset-glib/scanner.h | 7 ++--
c_glib/arrow-glib/compute.cpp | 2 +-
c_glib/test/dataset/test-file-system-dataset.rb | 2 +-
.../{test-scanner.rb => test-scanner-builder.rb} | 34 ++++++++++++++----
c_glib/test/dataset/test-scanner.rb | 7 ----
.../lib/arrow-dataset/arrow-table-loadable.rb | 11 +++++-
ruby/red-arrow-dataset/test/test-arrow-table.rb | 9 +++++
8 files changed, 85 insertions(+), 28 deletions(-)
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index eefd6c7..51542bb 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -18,6 +18,7 @@
*/
#include <arrow-glib/error.hpp>
+#include <arrow-glib/expression.hpp>
#include <arrow-glib/reader.hpp>
#include <arrow-glib/table.hpp>
@@ -138,6 +139,7 @@ typedef struct GADatasetScannerBuilderPrivate_ {
enum {
PROP_SCANNER_BUILDER = 1,
+ PROP_USE_ASYNC,
};
G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder,
@@ -171,6 +173,11 @@ gadataset_scanner_builder_set_property(GObject *object,
*static_cast<std::shared_ptr<arrow::dataset::ScannerBuilder> *>(
g_value_get_pointer(value));
break;
+ case PROP_USE_ASYNC:
+ garrow::check(nullptr,
+ priv->scanner_builder->UseAsync(g_value_get_boolean(value)),
+ "[scanner-builder][use-async][set]");
+ break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
@@ -199,6 +206,21 @@ gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass)
static_cast<GParamFlags>(G_PARAM_WRITABLE |
G_PARAM_CONSTRUCT_ONLY));
g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec);
+
+ arrow::dataset::ScanOptions default_options;
+ /**
+ * GADatasetScannerBuilder:use-async:
+ *
+ * Whether or not async mode is used.
+ *
+ * Since: 6.0.0
+ */
+ spec = g_param_spec_boolean("use-async",
+ "Use async",
+ "Whether or not async mode is used",
+ default_options.use_async,
+ static_cast<GParamFlags>(G_PARAM_WRITABLE));
+ g_object_class_install_property(gobject_class, PROP_USE_ASYNC, spec);
}
/**
@@ -245,22 +267,25 @@ gadataset_scanner_builder_new_record_batch_reader(
}
/**
- * gadataset_scanner_builder_use_async:
+ * gadataset_scanner_builder_set_filter:
* @builder: A #GADatasetScannerBuilder.
- * @use_async: Use the asynchronous scanner
+ * @expression: A #GArrowExpression to filter rows with.
* @error: (nullable): Return location for a #GError or %NULL.
*
- * Returns: void
+ * Returns: %TRUE on success, %FALSE on error.
*
* Since: 6.0.0
*/
-void
-gadataset_scanner_builder_use_async(GADatasetScannerBuilder *builder, gboolean use_async,
- GError **error)
+gboolean
+gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder,
+ GArrowExpression *expression,
+ GError **error)
{
auto arrow_builder = gadataset_scanner_builder_get_raw(builder);
- auto use_async_result = arrow_builder->UseAsync(use_async);
- garrow::check(error, use_async_result, "[scanner-builder][use_async]");
+ auto arrow_expression = garrow_expression_get_raw(expression);
+ return garrow::check(error,
+ arrow_builder->Filter(*arrow_expression),
+ "[scanner-builder][filter][set]");
}
/**
diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h
index 4afec00..59da257 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -61,9 +61,10 @@ gadataset_scanner_builder_new_record_batch_reader(
GArrowRecordBatchReader *reader);
GARROW_AVAILABLE_IN_6_0
-void
-gadataset_scanner_builder_use_async(
- GADatasetScannerBuilder *builder, gboolean use_async, GError **error);
+gboolean
+gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder,
+ GArrowExpression *expression,
+ GError **error);
GARROW_AVAILABLE_IN_5_0
GADatasetScanner *
diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 2f4a0de..cdfc96a 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -151,7 +151,7 @@ G_BEGIN_DECLS
* #GArrowScalarAggregateOptions is a class to customize the scalar
* aggregate functions such as `count` function and convenient
* functions of them such as garrow_array_count().
-
+ *
* #GArrowCountOptions is a class to customize the `count` function and
* garrow_array_count() family.
*
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index 0e91d75..1aef38f 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -60,7 +60,7 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
count: build_int32_array([1, 10, 2, 3]))
table_reader = Arrow::TableBatchReader.new(table)
scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader)
- scanner_builder.use_async(true)
+ scanner_builder.use_async = true
scanner = scanner_builder.finish
options = ArrowDataset::FileSystemDatasetWriteOptions.new
options.file_write_options = @format.default_write_options
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner-builder.rb
similarity index 65%
copy from c_glib/test/dataset/test-scanner.rb
copy to c_glib/test/dataset/test-scanner-builder.rb
index ed6a706..5674db4 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner-builder.rb
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-class TestDatasetScanner < Test::Unit::TestCase
+class TestDatasetScannerBuilder < Test::Unit::TestCase
include Helper::Buildable
include Helper::Writable
@@ -36,20 +36,40 @@ class TestDatasetScanner < Test::Unit::TestCase
factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
factory.file_system_uri = build_file_uri(path)
@dataset = factory.finish
- builder = @dataset.begin_scan
- @scanner = builder.finish
+ @builder = @dataset.begin_scan
yield
end
end
- def test_to_table
- assert_equal(@table, @scanner.to_table)
- end
-
def test_new_record_batch_reader
reader = Arrow::TableBatchReader.new(@table)
builder = ArrowDataset::ScannerBuilder.new(reader)
scanner = builder.finish
assert_equal(@table, scanner.to_table)
end
+
+ def test_filter
+ visible = Arrow::FieldExpression.new("visible")
+ true_scalar = Arrow::BooleanScalar.new(true)
+ true_datum = Arrow::ScalarDatum.new(true_scalar)
+ true_literal = Arrow::LiteralExpression.new(true_datum)
+ filter = Arrow::CallExpression.new("equal", [visible, true_literal])
+ @builder.filter = filter
+ scanner = @builder.finish
+ assert_equal(build_table(visible: [
+ build_boolean_array([true, true]),
+ build_boolean_array([true, true]),
+ ],
+ point: [
+ build_int32_array([1, 3]),
+ build_int32_array([-2, -4]),
+ ]),
+ scanner.to_table)
+ end
+
+ def test_use_async
+ @builder.use_async = true
+ scanner = @builder.finish
+ assert_equal(@table, scanner.to_table)
+ end
end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index ed6a706..f7702d4 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,11 +45,4 @@ class TestDatasetScanner < Test::Unit::TestCase
def test_to_table
assert_equal(@table, @scanner.to_table)
end
-
- def test_new_record_batch_reader
- reader = Arrow::TableBatchReader.new(@table)
- builder = ArrowDataset::ScannerBuilder.new(reader)
- scanner = builder.finish
- assert_equal(@table, scanner.to_table)
- end
end
diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
index bda8d86..14c8dce 100644
--- a/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
@@ -40,7 +40,16 @@ module ArrowDataset
dataset = FileSystemDataset.build(format) do |factory|
factory.file_system_uri = uri
end
- dataset.to_table
+ scanner_builder = dataset.begin_scan
+ @options.each do |key, value|
+ next if key == :format
+ next if value.nil?
+ setter = "#{key}="
+ next unless scanner_builder.respond_to?(setter)
+ scanner_builder.public_send(setter, value)
+ end
+ scanner = scanner_builder.finish
+ scanner.to_table
end
end
end
diff --git a/ruby/red-arrow-dataset/test/test-arrow-table.rb b/ruby/red-arrow-dataset/test/test-arrow-table.rb
index a7bbf42..1913063 100644
--- a/ruby/red-arrow-dataset/test/test-arrow-table.rb
+++ b/ruby/red-arrow-dataset/test/test-arrow-table.rb
@@ -67,5 +67,14 @@ class TestArrowTable < Test::Unit::TestCase
assert_equal(@table1.concatenate([@table2]),
Arrow::Table.load(@dir))
end
+
+ def test_filter
+ @table1.save(build_file_uri(@path1))
+ @table2.save(build_file_uri(@path2))
+ assert_equal(Arrow::Table.new(visible: [true, true, true],
+ point: [1, 3, 10]),
+ Arrow::Table.load(@dir,
+ filter: ["equal", :visible, true]))
+ end
end
end