lidavidm commented on a change in pull request #11991:
URL: https://github.com/apache/arrow/pull/11991#discussion_r780484531



##########
File path: cpp/src/arrow/dataset/scanner.h
##########
@@ -138,41 +133,46 @@ struct ARROW_DS_EXPORT ScanOptions {
   // This is used by Fragment implementations to apply the column
   // sub-selection optimization.
   std::vector<std::string> MaterializedFields() const;
-
-  // Return a threaded or serial TaskGroup according to use_threads.
-  std::shared_ptr<::arrow::internal::TaskGroup> TaskGroup() const;
 };
 
-/// \brief Read record batches from a range of a single data fragment. A
-/// ScanTask is meant to be a unit of work to be dispatched. The implementation
-/// must be thread and concurrent safe.
-class ARROW_DS_EXPORT ScanTask {
- public:
-  /// \brief Iterate through sequence of materialized record batches
-  /// resulting from the Scan. Execution semantics are encapsulated in the
-  /// particular ScanTask implementation
-  virtual Result<RecordBatchIterator> Execute() = 0;
-  virtual Future<RecordBatchVector> SafeExecute(::arrow::internal::Executor* 
executor);
-  virtual Future<> SafeVisit(::arrow::internal::Executor* executor,
-                             
std::function<Status(std::shared_ptr<RecordBatch>)> visitor);
+/// \brief Describes a projection
+struct ARROW_DS_EXPORT ProjectionDescr {
+  /// \brief The projection expression itself
+  /// This expression must be a call to make_struct
+  compute::Expression expression;
+  /// \brief The output schema of the projection.
 
-  virtual ~ScanTask() = default;
+  /// This can be calculated from the input schema and the expression but it
+  /// is cached here for convenience.
+  std::shared_ptr<Schema> schema;
 
-  const std::shared_ptr<ScanOptions>& options() const { return options_; }
-  const std::shared_ptr<Fragment>& fragment() const { return fragment_; }
+  /// \brief Create a ProjectionDescr by binding an expression to the dataset 
schema
+  ///
+  /// expression must return a struct type
+  static Result<ProjectionDescr> FromStructExpression(
+      const compute::Expression& expression, const Schema& dataset_schema);
 
- protected:
-  ScanTask(std::shared_ptr<ScanOptions> options, std::shared_ptr<Fragment> 
fragment)
-      : options_(std::move(options)), fragment_(std::move(fragment)) {}
+  /// \brief Create a ProjectionDescr from expressions/names for each field
+  static Result<ProjectionDescr> FromExpressions(
+      const std::vector<compute::Expression>& exprs, std::vector<std::string> 
names,
+      const Schema& dataset_schema);
 
-  std::shared_ptr<ScanOptions> options_;
-  std::shared_ptr<Fragment> fragment_;
+  /// \brief Create a default projection referencing fields in the dataset 
schema
+  static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
+                                           const Schema& dataset_schema);
+
+  /// \brief Make a projection that projects every field in the dataset schema
+  static Result<ProjectionDescr> Default(const Schema& dataset_schema);
 };
 
+/// \brief Utility method to set the projection expression and schema
+ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr 
projection);

Review comment:
       Ah, that's fair.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to