lidavidm commented on a change in pull request #11991:
URL: https://github.com/apache/arrow/pull/11991#discussion_r779543359



##########
File path: r/R/dataset-scan.R
##########
@@ -118,9 +113,6 @@ Scanner$create <- function(dataset,
   if (use_threads) {
     scanner_builder$UseThreads()
   }
-  if (use_async) {

Review comment:
       Is there an R equivalent of a deprecation warning we should be raising?

##########
File path: cpp/src/arrow/dataset/scanner.h
##########
@@ -138,41 +133,40 @@ struct ARROW_DS_EXPORT ScanOptions {
   // This is used by Fragment implementations to apply the column
   // sub-selection optimization.
   std::vector<std::string> MaterializedFields() const;
+};
 
-  // Return a threaded or serial TaskGroup according to use_threads.
-  std::shared_ptr<::arrow::internal::TaskGroup> TaskGroup() const;
+struct ARROW_DS_EXPORT ProjectionDescr {
+  compute::Expression expression;
+  std::shared_ptr<Schema> schema;
 };
 
-/// \brief Read record batches from a range of a single data fragment. A
-/// ScanTask is meant to be a unit of work to be dispatched. The implementation
-/// must be thread and concurrent safe.
-class ARROW_DS_EXPORT ScanTask {
- public:
-  /// \brief Iterate through sequence of materialized record batches
-  /// resulting from the Scan. Execution semantics are encapsulated in the
-  /// particular ScanTask implementation
-  virtual Result<RecordBatchIterator> Execute() = 0;
-  virtual Future<RecordBatchVector> SafeExecute(::arrow::internal::Executor* 
executor);
-  virtual Future<> SafeVisit(::arrow::internal::Executor* executor,
-                             
std::function<Status(std::shared_ptr<RecordBatch>)> visitor);
+/// \brief Create a ProjectionDescr by binding an expression to the dataset 
schema
+///
+/// expression must return a struct
+ARROW_DS_EXPORT Result<ProjectionDescr> MakeProjectionFromStructExpression(
+    const compute::Expression& expression, const Schema& dataset_schema);
 
-  virtual ~ScanTask() = default;
+/// \brief Create a ProjectionDescr from expressions/names for each field
+ARROW_DS_EXPORT Result<ProjectionDescr> MakeProjectionFromFieldExpressions(
+    const std::vector<compute::Expression>& exprs, std::vector<std::string> 
names,
+    const Schema& dataset_schema);
 
-  const std::shared_ptr<ScanOptions>& options() const { return options_; }
-  const std::shared_ptr<Fragment>& fragment() const { return fragment_; }
+/// \brief Create a default projection referencing fields in the dataset schema
+ARROW_DS_EXPORT Result<ProjectionDescr> MakeProjectionFromNames(
+    std::vector<std::string> names, const Schema& dataset_schema);

Review comment:
       Hmm, maybe 1) document the struct itself and 2) make these static 
methods on the struct? (`ProjectionDescr::FromNames` for instance)

##########
File path: cpp/src/arrow/dataset/scanner_test.cc
##########
@@ -706,30 +634,6 @@ TEST_P(TestScanner, Head) {
   AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false);
 }
 
-TEST_P(TestScanner, FromReader) {

Review comment:
       Should this test be kept? The API still exists/is still used and it 
seems it's not true that the async scanner can't be constructed from a reader.

##########
File path: c_glib/arrow-dataset-glib/scanner.cpp
##########
@@ -132,95 +113,59 @@ gadataset_scanner_to_table(GADatasetScanner *scanner,
   }
 }
 
-
 typedef struct GADatasetScannerBuilderPrivate_ {
   std::shared_ptr<arrow::dataset::ScannerBuilder> scanner_builder;
 } GADatasetScannerBuilderPrivate;
 
-enum {
-  PROP_SCANNER_BUILDER = 1,
-  PROP_USE_ASYNC,
-};
+enum { PROP_SCANNER_BUILDER = 1 };
 
-G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder,
-                           gadataset_scanner_builder,
+G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, gadataset_scanner_builder,
                            G_TYPE_OBJECT)
 
-#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj)        \
-  static_cast<GADatasetScannerBuilderPrivate *>(          \
-    gadataset_scanner_builder_get_instance_private(       \
-      GADATASET_SCANNER_BUILDER(obj)))
+#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \
+  static_cast<GADatasetScannerBuilderPrivate*>(    \
+      
gadataset_scanner_builder_get_instance_private(GADATASET_SCANNER_BUILDER(obj)))
 
-static void
-gadataset_scanner_builder_finalize(GObject *object)
-{
+static void gadataset_scanner_builder_finalize(GObject* object) {
   auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object);
   priv->scanner_builder.~shared_ptr();
   G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object);
 }
 
-static void
-gadataset_scanner_builder_set_property(GObject *object,
-                                       guint prop_id,
-                                       const GValue *value,
-                                       GParamSpec *pspec)
-{
+static void gadataset_scanner_builder_set_property(GObject* object, guint 
prop_id,
+                                                   const GValue* value,
+                                                   GParamSpec* pspec) {
   auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object);
 
   switch (prop_id) {
-  case PROP_SCANNER_BUILDER:
-    priv->scanner_builder =
-      *static_cast<std::shared_ptr<arrow::dataset::ScannerBuilder> *>(
-        g_value_get_pointer(value));
-    break;
-  case PROP_USE_ASYNC:
-    garrow::check(nullptr,
-                  priv->scanner_builder->UseAsync(g_value_get_boolean(value)),
-                  "[scanner-builder][use-async][set]");
-    break;
-  default:
-    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
-    break;
+    case PROP_SCANNER_BUILDER:
+      priv->scanner_builder =
+          *static_cast<std::shared_ptr<arrow::dataset::ScannerBuilder>*>(
+              g_value_get_pointer(value));
+      break;
+    default:
+      G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+      break;
   }
 }
 
-static void
-gadataset_scanner_builder_init(GADatasetScannerBuilder *object)
-{
+static void gadataset_scanner_builder_init(GADatasetScannerBuilder* object) {
   auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object);
-  new(&priv->scanner_builder) std::shared_ptr<arrow::dataset::ScannerBuilder>;
+  new (&priv->scanner_builder) std::shared_ptr<arrow::dataset::ScannerBuilder>;
 }
 
-static void
-gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass)
-{
+static void gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass* 
klass) {
   auto gobject_class = G_OBJECT_CLASS(klass);
-  gobject_class->finalize     = gadataset_scanner_builder_finalize;
+  gobject_class->finalize = gadataset_scanner_builder_finalize;
   gobject_class->set_property = gadataset_scanner_builder_set_property;
 
-  GParamSpec *spec;
-  spec = g_param_spec_pointer("scanner-builder",
-                              "Scanner builder",
-                              "The raw "
-                              "std::shared<arrow::dataset::ScannerBuilder> *",
-                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
-                                                       
G_PARAM_CONSTRUCT_ONLY));
+  GParamSpec* spec;
+  spec = g_param_spec_pointer(
+      "scanner-builder", "Scanner builder",
+      "The raw "
+      "std::shared<arrow::dataset::ScannerBuilder> *",
+      static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY));
   g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec);
-
-  arrow::dataset::ScanOptions default_options;
-  /**
-   * GADatasetScannerBuilder:use-async:
-   *
-   * Whether or not async mode is used.
-   *
-   * Since: 6.0.0
-   */
-  spec = g_param_spec_boolean("use-async",

Review comment:
       Similarly, is there a way to deprecate things here instead of removing 
it?

##########
File path: c_glib/arrow-dataset-glib/scanner.cpp
##########
@@ -48,63 +48,47 @@ enum {
   PROP_SCANNER = 1,
 };
 
-G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner,
-                           gadataset_scanner,
-                           G_TYPE_OBJECT)
+G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, gadataset_scanner, G_TYPE_OBJECT)
 
-#define GADATASET_SCANNER_GET_PRIVATE(obj)        \
-  static_cast<GADatasetScannerPrivate *>(         \
-    gadataset_scanner_get_instance_private(       \
-      GADATASET_SCANNER(obj)))
+#define GADATASET_SCANNER_GET_PRIVATE(obj) \
+  static_cast<GADatasetScannerPrivate*>(   \
+      gadataset_scanner_get_instance_private(GADATASET_SCANNER(obj)))

Review comment:
       Did this get autoformatted somehow? (Not sure if there's an expected 
style for the GLib bindings though.)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to