lidavidm commented on code in PR #12977:
URL: https://github.com/apache/arrow/pull/12977#discussion_r859144782


##########
cpp/src/arrow/dataset/partition.h:
##########
@@ -38,6 +38,10 @@ namespace dataset {
 
 constexpr char kFilenamePartitionSep = '_';
 
+struct PartitionPathFormat {

Review Comment:
   ```suggestion
   struct ARROW_DS_EXPORT PartitionPathFormat {
   ```



##########
cpp/src/arrow/dataset/partition.h:
##########
@@ -76,11 +80,8 @@ class ARROW_DS_EXPORT Partitioning {
       const std::shared_ptr<RecordBatch>& batch) const = 0;
 
   /// \brief Parse a path into a partition expression
-  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
-
-  struct PartitionPathFormat {
-    std::string directory, prefix;
-  };
+  virtual Result<compute::Expression> Parse(const std::string& directory = "",
+                                            const std::string& prefix = "") const = 0;

Review Comment:
   Why do we need the default parameter values?
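
   For reference, a minimal sketch of what the signature would look like without the defaults (assuming every caller passes both components explicitly):
   ```cpp
   // Hypothetical variant with no default arguments; call sites must supply
   // both the directory part and the filename prefix.
   virtual Result<compute::Expression> Parse(const std::string& directory,
                                             const std::string& prefix) const = 0;
   ```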



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -847,7 +847,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
     auto row_groups = Iota(metadata_subset->num_row_groups());
 
     auto partition_expression =
-        partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
+        partitioning
+            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory)

Review Comment:
   Why aren't we passing both components?
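
   Roughly something like this (a sketch only; `parts` is a local name introduced here, not taken from the diff):
   ```cpp
   // Hypothetical: keep both components of the stripped path and hand them
   // to Parse, instead of dropping the filename prefix.
   auto parts = StripPrefixAndFilename(path, options_.partition_base_dir);
   auto partition_expression = partitioning.Parse(parts.directory, parts.prefix);
   ```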



##########
cpp/src/arrow/dataset/partition.h:
##########
@@ -76,11 +80,8 @@ class ARROW_DS_EXPORT Partitioning {
       const std::shared_ptr<RecordBatch>& batch) const = 0;
 
   /// \brief Parse a path into a partition expression
-  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
-
-  struct PartitionPathFormat {
-    std::string directory, prefix;
-  };
+  virtual Result<compute::Expression> Parse(const std::string& directory = "",
+                                            const std::string& prefix = "") const = 0;

Review Comment:
   Though yes, it would be better if we could pass `const PartitionPathFormat&` instead. FWIW, I don't think we have to expose it to Python. Or we can just make a namedtuple on the Python side; it doesn't have to be a C++ class wrapper.
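
   A rough sketch of that variant (the parameter name is assumed, not part of the diff as written):
   ```cpp
   // Hypothetical struct-based signature: the path is split once up front and
   // passed as a single argument.
   virtual Result<compute::Expression> Parse(const PartitionPathFormat& path) const = 0;

   // A call site could then read roughly:
   //   partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
   ```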



##########
cpp/src/arrow/dataset/partition.cc:
##########
@@ -805,19 +806,20 @@ std::shared_ptr<PartitioningFactory> HivePartitioning::MakeFactory(
   return std::shared_ptr<PartitioningFactory>(new HivePartitioningFactory(options));
 }
 
-std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
+PartitionPathFormat StripPrefixAndFilename(const std::string& path,
+                                           const std::string& prefix) {
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
-  return basename_filename.first;
+  return PartitionPathFormat{basename_filename.first, basename_filename.second};

Review Comment:
   nit: `std::move` things here
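
   Concretely, the return statement above would become something like:
   ```cpp
   // basename_filename is a local pair of strings here, so its members can be
   // moved into the returned struct instead of copied.
   return PartitionPathFormat{std::move(basename_filename.first),
                              std::move(basename_filename.second)};
   ```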



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -873,7 +874,8 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
     size_t i = 0;
     for (const auto& e : paths_with_row_group_ids_) {
-      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
+      stripped[i++] =
+          StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;

Review Comment:
   (Can we cover this with a test?)



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -847,7 +847,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
     auto row_groups = Iota(metadata_subset->num_row_groups());
 
     auto partition_expression =
-        partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
+        partitioning
+            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory)

Review Comment:
   (Also, can we cover this with a test?)



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -873,7 +874,8 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
     size_t i = 0;
     for (const auto& e : paths_with_row_group_ids_) {
-      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
+      stripped[i++] =
+          StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;

Review Comment:
   Hmm, don't we still want the filename in the path in case the partitioning factory is a filename PartitioningFactory?



##########
cpp/src/arrow/dataset/discovery.cc:
##########
@@ -278,9 +278,11 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   }
 
   std::vector<std::shared_ptr<FileFragment>> fragments;
+  std::string fixed_path;

Review Comment:
   Is this necessary? It gets shadowed below



##########
cpp/src/arrow/dataset/partition.h:
##########
@@ -353,16 +359,17 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& directory,
+                                     const std::string& prefix) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
 /// \brief Remove a prefix and the filename of a path.
 ///
 /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`

Review Comment:
   Example in docstring needs updating
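
   For instance, with the new return type the example could read something like:
   ```cpp
   /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
   /// PartitionPathFormat{"year=2019", "c.txt"}`
   ```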


