[GitHub] [arrow] fsaintjacques commented on a change in pull request #7073: ARROW-8318: [C++][Dataset] Construct FileSystemDataset from fragments

2020-05-01 Thread GitBox


fsaintjacques commented on a change in pull request #7073:
URL: https://github.com/apache/arrow/pull/7073#discussion_r418564299



##
File path: cpp/src/arrow/dataset/file_base.cc
##
@@ -83,131 +83,67 @@ Result 
FileFragment::Scan(std::shared_ptr options
 
 FileSystemDataset::FileSystemDataset(std::shared_ptr schema,
  std::shared_ptr 
root_partition,
- std::shared_ptr format,
- std::shared_ptr 
filesystem,
- fs::PathForest forest,
- ExpressionVector file_partitions)
+ 
std::vector> fragments)
 : Dataset(std::move(schema), std::move(root_partition)),
-  format_(std::move(format)),
-  filesystem_(std::move(filesystem)),
-  forest_(std::move(forest)),
-  partitions_(std::move(file_partitions)) {
-  DCHECK_EQ(static_cast(forest_.size()), partitions_.size());
-}
-
-Result> FileSystemDataset::Make(
-std::shared_ptr schema, std::shared_ptr root_partition,
-std::shared_ptr format, std::shared_ptr 
filesystem,
-std::vector infos) {
-  ExpressionVector partitions(infos.size(), scalar(true));
-  return Make(std::move(schema), std::move(root_partition), std::move(format),
-  std::move(filesystem), std::move(infos), std::move(partitions));
-}
+  fragments_(std::move(fragments)) {}
 
 Result> FileSystemDataset::Make(
 std::shared_ptr schema, std::shared_ptr root_partition,
-std::shared_ptr format, std::shared_ptr 
filesystem,
-std::vector infos, ExpressionVector partitions) {
-  ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(infos), 
));
-  return Make(std::move(schema), std::move(root_partition), std::move(format),
-  std::move(filesystem), std::move(forest), std::move(partitions));
-}
+FragmentVector fragments) {
+  std::vector> file_fragments;
+  for (const auto& fragment : fragments) {
+auto file_fragment = 
internal::checked_pointer_cast(fragment);

Review comment:
   Resolved by changing the to vector and moving the checks 
at callsite.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on a change in pull request #7073: ARROW-8318: [C++][Dataset] Construct FileSystemDataset from fragments

2020-05-01 Thread GitBox


fsaintjacques commented on a change in pull request #7073:
URL: https://github.com/apache/arrow/pull/7073#discussion_r418556583



##
File path: cpp/src/arrow/dataset/file_base.cc
##
@@ -221,42 +157,34 @@ Result> 
FileSystemDataset::Write(
 filesystem = std::make_shared();
   }
 
-  std::vector files(plan.paths.size());
-  ExpressionVector partition_expressions(plan.paths.size(), scalar(true));
   auto task_group = scan_context->TaskGroup();
-
   auto partition_base_dir = 
fs::internal::EnsureTrailingSlash(plan.partition_base_dir);
   auto extension = "." + plan.format->type_name();
 
+  FragmentVector fragments;
   for (size_t i = 0; i < plan.paths.size(); ++i) {
 const auto& op = plan.fragment_or_partition_expressions[i];
-if (util::holds_alternative>(op)) {
-  files[i].set_type(fs::FileType::Directory);
-  files[i].set_path(partition_base_dir + plan.paths[i]);
+if (util::holds_alternative>(op)) {

Review comment:
   This will be done part of ARROW-8382.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on a change in pull request #7073: ARROW-8318: [C++][Dataset] Construct FileSystemDataset from fragments

2020-05-01 Thread GitBox


fsaintjacques commented on a change in pull request #7073:
URL: https://github.com/apache/arrow/pull/7073#discussion_r418556583



##
File path: cpp/src/arrow/dataset/file_base.cc
##
@@ -221,42 +157,34 @@ Result> 
FileSystemDataset::Write(
 filesystem = std::make_shared();
   }
 
-  std::vector files(plan.paths.size());
-  ExpressionVector partition_expressions(plan.paths.size(), scalar(true));
   auto task_group = scan_context->TaskGroup();
-
   auto partition_base_dir = 
fs::internal::EnsureTrailingSlash(plan.partition_base_dir);
   auto extension = "." + plan.format->type_name();
 
+  FragmentVector fragments;
   for (size_t i = 0; i < plan.paths.size(); ++i) {
 const auto& op = plan.fragment_or_partition_expressions[i];
-if (util::holds_alternative>(op)) {
-  files[i].set_type(fs::FileType::Directory);
-  files[i].set_path(partition_base_dir + plan.paths[i]);
+if (util::holds_alternative>(op)) {

Review comment:
   This will be part of ARROW-8382.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on a change in pull request #7073: ARROW-8318: [C++][Dataset] Construct FileSystemDataset from fragments

2020-05-01 Thread GitBox


fsaintjacques commented on a change in pull request #7073:
URL: https://github.com/apache/arrow/pull/7073#discussion_r418554202



##
File path: cpp/src/arrow/dataset/file_base.cc
##
@@ -83,131 +83,67 @@ Result 
FileFragment::Scan(std::shared_ptr options
 
 FileSystemDataset::FileSystemDataset(std::shared_ptr schema,
  std::shared_ptr 
root_partition,
- std::shared_ptr format,
- std::shared_ptr 
filesystem,
- fs::PathForest forest,
- ExpressionVector file_partitions)
+ 
std::vector> fragments)
 : Dataset(std::move(schema), std::move(root_partition)),
-  format_(std::move(format)),
-  filesystem_(std::move(filesystem)),
-  forest_(std::move(forest)),
-  partitions_(std::move(file_partitions)) {
-  DCHECK_EQ(static_cast(forest_.size()), partitions_.size());
-}
-
-Result> FileSystemDataset::Make(
-std::shared_ptr schema, std::shared_ptr root_partition,
-std::shared_ptr format, std::shared_ptr 
filesystem,
-std::vector infos) {
-  ExpressionVector partitions(infos.size(), scalar(true));
-  return Make(std::move(schema), std::move(root_partition), std::move(format),
-  std::move(filesystem), std::move(infos), std::move(partitions));
-}
+  fragments_(std::move(fragments)) {}
 
 Result> FileSystemDataset::Make(
 std::shared_ptr schema, std::shared_ptr root_partition,
-std::shared_ptr format, std::shared_ptr 
filesystem,
-std::vector infos, ExpressionVector partitions) {
-  ARROW_ASSIGN_OR_RAISE(auto forest, fs::PathForest::Make(std::move(infos), 
));
-  return Make(std::move(schema), std::move(root_partition), std::move(format),
-  std::move(filesystem), std::move(forest), std::move(partitions));
-}
+FragmentVector fragments) {
+  std::vector> file_fragments;
+  for (const auto& fragment : fragments) {
+auto file_fragment = 
internal::checked_pointer_cast(fragment);

Review comment:
   derp, I actually forgot to check.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org