lidavidm commented on a change in pull request #10264:
URL: https://github.com/apache/arrow/pull/10264#discussion_r643379606
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -259,15 +274,29 @@ Result<std::string> KeyValuePartitioning::Format(const
compute::Expression& expr
return FormatValues(values);
}
-std::vector<KeyValuePartitioning::Key> DirectoryPartitioning::ParseKeys(
+DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
+ ArrayVector dictionaries,
+ KeyValuePartitioningOptions
options)
+ : KeyValuePartitioning(std::move(schema), std::move(dictionaries),
options) {
+ if (options.url_decode_segments) {
+ util::InitializeUTF8();
+ }
+}
+
+Result<std::vector<KeyValuePartitioning::Key>>
DirectoryPartitioning::ParseKeys(
const std::string& path) const {
std::vector<Key> keys;
int i = 0;
for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
if (i >= schema_->num_fields()) break;
- keys.push_back({schema_->field(i++)->name(), std::move(segment)});
+ if (options_.url_decode_segments) {
+ ARROW_ASSIGN_OR_RAISE(auto decoded, SafeUriUnescape(segment));
+ keys.push_back({schema_->field(i++)->name(), std::move(decoded)});
+ } else {
+ keys.push_back({schema_->field(i++)->name(), std::move(segment)});
Review comment:
Just because we didn't before, but perhaps we should.
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -481,28 +533,39 @@ std::shared_ptr<PartitioningFactory>
DirectoryPartitioning::MakeFactory(
new DirectoryPartitioningFactory(std::move(field_names), options));
}
-util::optional<KeyValuePartitioning::Key> HivePartitioning::ParseKey(
- const std::string& segment, const std::string& null_fallback) {
+Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
+ const std::string& segment, const HivePartitioningOptions& options) {
auto name_end = string_view(segment).find_first_of('=');
// Not round-trippable
if (name_end == string_view::npos) {
return util::nullopt;
}
auto name = segment.substr(0, name_end);
- auto value = segment.substr(name_end + 1);
- if (value == null_fallback) {
- return Key{name, util::nullopt};
+ std::string value;
+ if (options.url_decode_segments) {
+ // Static method, so we have no better place for it
+ util::InitializeUTF8();
+ auto raw_value =
+ util::string_view(segment.data() + name_end + 1, segment.size() -
name_end - 1);
Review comment:
This was written this way just to avoid a copy, but the same thing can be
done as `string_view(segment).substr(...)` instead.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org