This is an automated email from the ASF dual-hosted git repository.
weibin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git
The following commit(s) were added to refs/heads/main by this push:
new 8be738e feat(c++): Use simple-uri-parser as uri parser, remove the
rely on arrow::internal::URI (#460)
8be738e is described below
commit 8be738e298a016ef2d5837852cddc0afbc8c9eeb
Author: Weibin Zeng <[email protected]>
AuthorDate: Tue Apr 23 13:42:57 2024 +0800
feat(c++): Use simple-uri-parser as uri parser, remove the rely on
arrow::internal::URI (#460)
---
NOTICE | 4 +
cpp/CMakeLists.txt | 4 +-
cpp/src/filesystem.cc | 22 +--
cpp/thirdparty/simple-uri-parser/uri_parser.h | 241 ++++++++++++++++++++++++++
licenses/LICENSE-simple-uri-parser | 21 +++
5 files changed, 278 insertions(+), 14 deletions(-)
diff --git a/NOTICE b/NOTICE
index 9f53e73..e5a8308 100644
--- a/NOTICE
+++ b/NOTICE
@@ -74,6 +74,10 @@ The text of each license is also included in
licenses/LICENSE-[project].txt
cpp/thirdparty/mini-yaml/Yaml.hpp
cpp/thirdparty/mini-yaml/Yaml.cpp
+* simple-uri-parser (https://github.com/jholloc/simple-uri-parser)
+ Files:
+ cpp/thirdparty/simple-uri-parser/uri_parser.h
+
================================================================
BSD 3-Clause licenses
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3046ffb..5a6fc007 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -233,7 +233,9 @@ macro(build_gar)
target_compile_features(gar PRIVATE cxx_std_17)
target_include_directories(gar PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
-
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml>
+ )
+ target_include_directories(gar PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml
+
${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
)
if(BUILD_ARROW_FROM_SOURCE)
target_include_directories(gar SYSTEM BEFORE PRIVATE
${GAR_ARROW_INCLUDE_DIR})
diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc
index bc76e73..e0a8a4a 100644
--- a/cpp/src/filesystem.cc
+++ b/cpp/src/filesystem.cc
@@ -25,6 +25,7 @@
#include "arrow/filesystem/s3fs.h"
#include "arrow/ipc/writer.h"
#include "parquet/arrow/writer.h"
+#include "simple-uri-parser/uri_parser.h"
#include "gar/fwd.h"
#include "gar/util/expression.h"
@@ -78,12 +79,6 @@ static Status CastToLargeOffsetArray(
GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(out, arrow::ChunkedArray::Make(chunks));
return Status::OK();
}
-
-Result<arrow::internal::Uri> ParseFileSystemUri(const std::string& uri_string)
{
- arrow::internal::Uri uri;
- RETURN_NOT_ARROW_OK(uri.Parse(uri_string));
- return std::move(uri);
-}
} // namespace detail
std::shared_ptr<ds::FileFormat> FileSystem::GetFileFormat(
@@ -291,15 +286,16 @@ Result<std::shared_ptr<FileSystem>>
FileSystemFromUriOrPath(
GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(
auto arrow_fs, arrow::fs::FileSystemFromUriOrPath(uri_string));
- GAR_ASSIGN_OR_RAISE(auto uri, detail::ParseFileSystemUri(uri_string));
+ auto uri = uri::parse_uri(uri_string);
+ if (uri.error != uri::Error::None) {
+ return Status::Invalid("Failed to parse URI: ", uri_string);
+ }
if (out_path != nullptr) {
- if (uri.scheme() == "file" || uri.scheme() == "hdfs" ||
- uri.scheme().empty()) {
- *out_path = uri.path();
- } else if (uri.scheme() == "s3" || uri.scheme() == "gs") {
+ if (uri.scheme == "file" || uri.scheme == "hdfs" || uri.scheme.empty()) {
+ *out_path = uri.path;
+ } else if (uri.scheme == "s3" || uri.scheme == "gs") {
// bucket name is the host, path is the path
- // the arrow parser would delete the trailing slash which we don't want
to
- *out_path = uri.host() + uri.path();
+ *out_path = uri.authority.host + uri.path;
} else {
return Status::Invalid("Unrecognized filesystem type in URI: ",
uri_string);
diff --git a/cpp/thirdparty/simple-uri-parser/uri_parser.h
b/cpp/thirdparty/simple-uri-parser/uri_parser.h
new file mode 100644
index 0000000..c90fe84
--- /dev/null
+++ b/cpp/thirdparty/simple-uri-parser/uri_parser.h
@@ -0,0 +1,241 @@
+/**
+* MIT License
+*
+* Copyright (c) 2021 Jonathan Hollocombe
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+
+* The above copyright notice and this permission notice shall be included in
all
+* copies or substantial portions of the Software.
+
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef SIMPLE_URI_PARSER_LIBRARY_H
+#define SIMPLE_URI_PARSER_LIBRARY_H
+
#include <algorithm>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
+
// Detect the language standard in use. MSVC only reports a meaningful
// __cplusplus when asked to, so _MSVC_LANG is consulted there instead.
#ifndef simple_uri_CPLUSPLUS
# if defined(_MSVC_LANG ) && !defined(__clang__)
#  define simple_uri_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )
# else
#  define simple_uri_CPLUSPLUS __cplusplus
# endif
#endif

#define simple_uri_CPP17_OR_GREATER ( simple_uri_CPLUSPLUS >= 201703L )

// <string_view> only exists from C++17 on, so it is included conditionally.
#if simple_uri_CPP17_OR_GREATER
#include <string_view>
#endif

namespace uri {

// With C++17 the parser works on non-owning views; otherwise it falls back to
// std::string (copies) so the same code compiles on older standards.
#if simple_uri_CPP17_OR_GREATER
using string_view_type = std::string_view;
using string_arg_type = std::string_view;
constexpr auto npos = std::string_view::npos;
#else
using string_view_type = std::string;
using string_arg_type = const std::string&;
constexpr auto npos = std::string::npos;
#endif

/// Decoded query component: argument name -> argument value ("" if no '=').
using query_type = std::unordered_map<std::string, std::string>;

/// Outcome of parse_uri().
enum class Error {
  None,           ///< The URI was parsed successfully.
  InvalidScheme,  ///< No ':' was found or the scheme is not well formed.
  InvalidPort,    ///< The port is not a decimal number in [0, 65535].
};

/// The "//userinfo@host:port" component of a URI.
struct Authority {
  std::string authority;  ///< Whole authority text as it appeared in the URI.
  std::string userinfo;   ///< Text before '@'; empty when there is none.
  std::string host;       ///< Host; IPv6 literals keep their brackets.
  long port = 0;          ///< Port number; 0 when absent or empty.
};

/// Result of parse_uri(). When `error` is not Error::None every other member
/// is left default-constructed.
struct Uri {
  Error error;
  std::string scheme;  ///< Lower-cased scheme, without the trailing ':'.
  Authority authority = {};
  std::string path;
  query_type query = {};
  std::string query_string;  ///< Raw query text, without the leading '?'.
  std::string fragment;      ///< Fragment text, without the leading '#'.

  explicit Uri(Error error) : error(error) {}
  Uri(std::string scheme, Authority authority, std::string path,
      query_type query, std::string query_string, std::string fragment)
      : error(Error::None)
      , scheme(std::move(scheme))
      , authority(std::move(authority))
      , path(std::move(path))
      , query(std::move(query))
      , query_string(std::move(query_string))
      , fragment(std::move(fragment))
  {}
};

// Implementation helpers. They are `inline` members of a named namespace
// rather than an anonymous one: an anonymous namespace in a header gives every
// including translation unit its own copy and leaks the helper names into the
// includer's global scope.
namespace detail {

// ASCII-only classification: URI syntax is defined over ASCII, and the
// <cctype> functions are locale dependent and undefined for negative `char`.
inline bool is_ascii_alpha(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

inline bool is_ascii_digit(char c) { return c >= '0' && c <= '9'; }

/// True when `scheme` matches ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
inline bool valid_scheme(string_arg_type scheme) {
  if (scheme.empty() || !is_ascii_alpha(scheme[0])) {
    return false;
  }
  return std::all_of(scheme.begin(), scheme.end(), [](char c) {
    return is_ascii_alpha(c) || is_ascii_digit(c) || c == '+' || c == '.' ||
           c == '-';
  });
}

/// Splits "scheme:rest". Returns the lower-cased scheme and the text that
/// follows the ':'; reports InvalidScheme when there is no valid scheme.
inline std::tuple<std::string, Error, string_view_type> parse_scheme(
    string_arg_type uri) {
  using result_type = std::tuple<std::string, Error, string_view_type>;

  const auto colon = uri.find(':');
  if (colon == npos || !valid_scheme(uri.substr(0, colon))) {
    return result_type(std::string(), Error::InvalidScheme,
                       string_view_type(uri));
  }

  std::string scheme(uri.substr(0, colon));
  std::transform(scheme.begin(), scheme.end(), scheme.begin(), [](char c) {
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
  });
  return result_type(std::move(scheme), Error::None, uri.substr(colon + 1));
}

/// Parses a decimal port. An empty string is accepted and yields 0. Returns
/// false on any non-digit character or a value above 65535.
inline bool parse_port(string_arg_type text, long& port) {
  long value = 0;
  for (const char c : text) {
    if (!is_ascii_digit(c)) {
      return false;
    }
    value = value * 10 + (c - '0');
    if (value > 65535) {  // also keeps `value` far away from overflow
      return false;
    }
  }
  port = value;
  return true;
}

/// Parses an optional "//[userinfo@]host[:port]" prefix. The returned
/// remainder starts at the '/', '?' or '#' that ends the authority, or is
/// empty when the authority runs to the end of the input.
inline std::tuple<Authority, Error, string_view_type> parse_authority(
    string_arg_type uri) {
  using result_type = std::tuple<Authority, Error, string_view_type>;
  Authority authority;

  const bool has_authority =
      uri.size() >= 2 && uri[0] == '/' && uri[1] == '/';
  if (!has_authority) {
    return result_type(authority, Error::None, string_view_type(uri));
  }

  const auto after_slashes = uri.substr(2);
  const auto end = after_slashes.find_first_of("/?#");
  auto auth = after_slashes.substr(0, end);
  // `end` may be npos; never do arithmetic on it.
  const auto rest =
      (end == npos) ? string_view_type() : after_slashes.substr(end);
  authority.authority = std::string(auth);

  const auto at = auth.find('@');
  if (at != npos) {
    authority.userinfo = std::string(auth.substr(0, at));
    auth = auth.substr(at + 1);
  }

  auto host = auth;
  auto port = string_view_type();
  if (!auth.empty() && auth[0] == '[') {
    // IPv6 literal: the colons inside the brackets are not port separators.
    const auto close = auth.find(']');
    if (close != npos) {
      host = auth.substr(0, close + 1);
      const auto tail = auth.substr(close + 1);
      if (!tail.empty()) {
        if (tail[0] != ':') {
          return result_type(authority, Error::InvalidPort, auth);
        }
        port = tail.substr(1);
      }
    }
  } else {
    const auto colon = auth.find(':');
    if (colon != npos) {
      host = auth.substr(0, colon);
      port = auth.substr(colon + 1);
    }
  }

  if (!parse_port(port, authority.port)) {
    return result_type(authority, Error::InvalidPort, auth);
  }
  authority.host = std::string(host);

  return result_type(std::move(authority), Error::None, rest);
}

/// Extracts the path. The remainder keeps its leading '?' or '#' so that the
/// following stages can tell a query from a fragment.
inline std::tuple<std::string, Error, string_view_type> parse_path(
    string_arg_type uri) {
  using result_type = std::tuple<std::string, Error, string_view_type>;

  const auto end = uri.find_first_of("#?");
  if (end == npos) {
    return result_type(std::string(uri), Error::None, string_view_type());
  }
  return result_type(std::string(uri.substr(0, end)), Error::None,
                     uri.substr(end));
}

/// Parses a "?query" prefix into a key/value map plus the raw query text.
/// Input that does not start with '?' has no query and is returned untouched
/// as the remainder; otherwise the remainder starts at '#' or is empty.
inline std::tuple<query_type, std::string, Error, string_view_type>
parse_query(string_arg_type uri) {
  using result_type =
      std::tuple<query_type, std::string, Error, string_view_type>;
  query_type query;

  if (uri.empty() || uri[0] != '?') {
    return result_type(std::move(query), std::string(), Error::None,
                       string_view_type(uri));
  }

  const auto hash = uri.find('#');
  auto pending = (hash == npos) ? uri.substr(1) : uri.substr(1, hash - 1);
  const auto rest = (hash == npos) ? string_view_type() : uri.substr(hash);
  std::string query_string(pending);

  // Arguments are separated by '&', ';' or '?'.
  while (!pending.empty()) {
    const auto delim = pending.find_first_of("&;?");
    const auto arg = pending.substr(0, delim);
    const auto equals = arg.find('=');
    if (equals == npos) {
      query[std::string(arg)] = std::string();
    } else {
      query[std::string(arg.substr(0, equals))] =
          std::string(arg.substr(equals + 1));
    }
    pending = (delim == npos) ? string_view_type() : pending.substr(delim + 1);
  }

  return result_type(std::move(query), std::move(query_string), Error::None,
                     rest);
}

/// Parses a "#fragment" prefix; anything else yields an empty fragment.
inline std::tuple<std::string, Error, string_view_type> parse_fragment(
    string_arg_type uri) {
  using result_type = std::tuple<std::string, Error, string_view_type>;

  std::string fragment;
  if (!uri.empty() && uri[0] == '#') {
    fragment = std::string(uri.substr(1));
  }
  return result_type(std::move(fragment), Error::None, string_view_type());
}

}  // namespace detail

/// Parses `uri_in` of the form
///   scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
/// into its components.
///
/// @param uri_in text to parse; it is only read during the call.
/// @return a Uri whose `error` is Error::None on success. On failure `error`
///         names the component that was rejected and all other members are
///         default-constructed.
inline Uri parse_uri(uri::string_arg_type uri_in) {
  Error error = Error::None;
  string_view_type rest;

  std::string scheme;
  std::tie(scheme, error, rest) = detail::parse_scheme(uri_in);
  if (error != Error::None) {
    return Uri(error);
  }

  Authority authority;
  std::tie(authority, error, rest) = detail::parse_authority(rest);
  if (error != Error::None) {
    return Uri(error);
  }

  std::string path;
  std::tie(path, error, rest) = detail::parse_path(rest);
  if (error != Error::None) {
    return Uri(error);
  }

  query_type query;
  std::string query_string;
  std::tie(query, query_string, error, rest) = detail::parse_query(rest);
  if (error != Error::None) {
    return Uri(error);
  }

  std::string fragment;
  std::tie(fragment, error, rest) = detail::parse_fragment(rest);
  if (error != Error::None) {
    return Uri(error);
  }

  return Uri(std::move(scheme), std::move(authority), std::move(path),
             std::move(query), std::move(query_string), std::move(fragment));
}

} // namespace uri
+
+#endif // SIMPLE_URI_PARSER_LIBRARY_H
diff --git a/licenses/LICENSE-simple-uri-parser
b/licenses/LICENSE-simple-uri-parser
new file mode 100644
index 0000000..5c6b9f0
--- /dev/null
+++ b/licenses/LICENSE-simple-uri-parser
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Jonathan Hollocombe
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]