This is an automated email from the ASF dual-hosted git repository.

weibin pushed a commit to branch v0.11.3-patch
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git

commit f35fe59e70211ff26341ba2c1b0bd57c7eb66b34
Author: Weibin Zeng <[email protected]>
AuthorDate: Tue Apr 23 13:42:57 2024 +0800

    feat(c++): Use simple-uri-parser as the URI parser and remove the reliance on arrow::internal::Uri (#460)
---
 cpp/CMakeLists.txt                            |   4 +-
 cpp/src/filesystem.cc                         |  22 +--
 cpp/thirdparty/simple-uri-parser/uri_parser.h | 241 ++++++++++++++++++++++++++
 licenses/LICENSE-simple-uri-parser            |  21 +++
 4 files changed, 274 insertions(+), 14 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7116b6f7..e8352f69 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -215,7 +215,9 @@ macro(build_gar)
     target_compile_features(gar PRIVATE cxx_std_17)
     target_include_directories(gar PUBLIC 
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
                                           
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
-                                          
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml>
+    )
+    target_include_directories(gar PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml
+                                           
${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
     )
     if(BUILD_ARROW_FROM_SOURCE)
         target_include_directories(gar SYSTEM BEFORE PRIVATE 
${GAR_ARROW_INCLUDE_DIR})
diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc
index b5528f1a..85132d45 100644
--- a/cpp/src/filesystem.cc
+++ b/cpp/src/filesystem.cc
@@ -22,6 +22,7 @@
 #include "arrow/filesystem/s3fs.h"
 #include "arrow/ipc/writer.h"
 #include "parquet/arrow/writer.h"
+#include "simple-uri-parser/uri_parser.h"
 
 #include "gar/fwd.h"
 #include "gar/util/expression.h"
@@ -75,12 +76,6 @@ static Status CastToLargeOffsetArray(
   GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(out, arrow::ChunkedArray::Make(chunks));
   return Status::OK();
 }
-
-Result<arrow::internal::Uri> ParseFileSystemUri(const std::string& uri_string) 
{
-  arrow::internal::Uri uri;
-  RETURN_NOT_ARROW_OK(uri.Parse(uri_string));
-  return std::move(uri);
-}
 }  // namespace detail
 
 std::shared_ptr<ds::FileFormat> FileSystem::GetFileFormat(
@@ -286,15 +281,16 @@ Result<std::shared_ptr<FileSystem>> 
FileSystemFromUriOrPath(
 
   GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(
       auto arrow_fs, arrow::fs::FileSystemFromUriOrPath(uri_string));
-  GAR_ASSIGN_OR_RAISE(auto uri, detail::ParseFileSystemUri(uri_string));
+  auto uri = uri::parse_uri(uri_string);
+  if (uri.error != uri::Error::None) {
+    return Status::Invalid("Failed to parse URI: ", uri_string);
+  }
   if (out_path != nullptr) {
-    if (uri.scheme() == "file" || uri.scheme() == "hdfs" ||
-        uri.scheme().empty()) {
-      *out_path = uri.path();
-    } else if (uri.scheme() == "s3" || uri.scheme() == "gs") {
+    if (uri.scheme == "file" || uri.scheme == "hdfs" || uri.scheme.empty()) {
+      *out_path = uri.path;
+    } else if (uri.scheme == "s3" || uri.scheme == "gs") {
       // bucket name is the host, path is the path
-      // the arrow parser would delete the trailing slash which we don't want 
to
-      *out_path = uri.host() + uri.path();
+      *out_path = uri.authority.host + uri.path;
     } else {
       return Status::Invalid("Unrecognized filesystem type in URI: ",
                              uri_string);
diff --git a/cpp/thirdparty/simple-uri-parser/uri_parser.h 
b/cpp/thirdparty/simple-uri-parser/uri_parser.h
new file mode 100644
index 00000000..c90fe84b
--- /dev/null
+++ b/cpp/thirdparty/simple-uri-parser/uri_parser.h
@@ -0,0 +1,241 @@
+/**
+* MIT License
+*
+* Copyright (c) 2021 Jonathan Hollocombe
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+
+* The above copyright notice and this permission notice shall be included in 
all
+* copies or substantial portions of the Software.
+
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef SIMPLE_URI_PARSER_LIBRARY_H
+#define SIMPLE_URI_PARSER_LIBRARY_H
+
+#include <string>
+#include <unordered_map>
+#include <algorithm>
+
+#ifndef simple_uri_CPLUSPLUS  // Language-standard level used by this header; left alone if already defined.
+# if defined(_MSVC_LANG ) && !defined(__clang__)  // _MSVC_LANG is defined and the compiler is not clang.
+#  define simple_uri_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )  // _MSC_VER 1900 is pinned to 201103L; otherwise use _MSVC_LANG.
+# else  // Every other case: take the level straight from __cplusplus.
+#  define simple_uri_CPLUSPLUS __cplusplus
+# endif
+#endif
+
+#define simple_uri_CPP17_OR_GREATER  ( simple_uri_CPLUSPLUS >= 201703L )  // True when the detected level is C++17 (201703L) or newer.
+
+namespace uri {  // Public types of the URI parser: string aliases, error codes and the parse result.
+
+#if simple_uri_CPP17_OR_GREATER  // C++17 or newer: the parse steps work on std::string_view.
+  using string_view_type = std::string_view;  // Type of the "remaining input" each parse step hands back.
+  using string_arg_type = std::string_view;  // Type the parse functions take their input as.
+  constexpr auto npos = std::string_view::npos;  // "Not found" value matching string_view_type. NOTE(review): <string_view> is not among this header's includes.
+#else  // Before C++17: the same aliases are expressed with std::string.
+  using string_view_type = std::string;
+  using string_arg_type = const std::string&;
+  constexpr auto npos = std::string::npos;
+#endif
+
+using query_type = std::unordered_map<std::string, std::string>;  // Query parameters: name -> value.
+
+enum class Error {  // Outcome of a parse, stored in Uri::error.
+    None,  // No error was reported.
+    InvalidScheme,  // Returned by parse_scheme: no ':' in the input, or the scheme fails valid_scheme.
+    InvalidPort,  // Returned by parse_authority when the text after ':' is not accepted as a port.
+};
+
+struct Authority {  // Authority component of a URI, filled in by parse_authority.
+    std::string authority;  // Raw authority text following the leading "//".
+    std::string userinfo;  // Text before '@'; empty when the authority has no '@'.
+    std::string host;  // Host part of the authority.
+    long port = 0;  // Port number; remains 0 unless parse_authority assigns one.
+};
+
+struct Uri {  // Result of parse_uri: an error code plus the parsed components.
+    Error error;  // Set by both constructors; Error::None for a successful parse.
+    std::string scheme;  // Scheme as produced by parse_scheme (lower-cased).
+    Authority authority = {};
+    std::string path;  // Path as produced by parse_path.
+    query_type query = {};  // Parsed query parameters.
+    std::string query_string;  // Query text as a single string.
+    std::string fragment;
+
+    explicit Uri(Error error) : error(error) {}  // Failure result: stores only the error code; other members keep their defaults.
+    Uri(std::string scheme, Authority authority, std::string path, query_type 
query, std::string query_string, std::string fragment)
+        : error(Error::None)  // This constructor always records success.
+        , scheme(std::move(scheme))
+        , authority(std::move(authority))
+        , path(std::move(path))
+        , query(std::move(query))
+        , query_string(std::move(query_string))
+        , fragment(std::move(fragment))
+        {}
+};
+
+}  // namespace uri
+
+namespace {
+
+// Checks whether `scheme` is acceptable as a URI scheme.
+//
+// scheme: candidate scheme text (the part of the URI before the first ':').
+// Returns true when `scheme` is non-empty and every character is either
+// accepted by std::isalnum or is one of '+', '.', '-'; false otherwise.
+bool valid_scheme(uri::string_arg_type scheme) {
+    if (scheme.empty()) {
+        return false;
+    }
+    // std::isalnum is undefined for negative arguments, so each character is
+    // received as unsigned char before it is classified (this matters for
+    // bytes >= 0x80 on platforms where plain char is signed).
+    return std::all_of(scheme.begin(), scheme.end(), [](unsigned char c) {
+        return std::isalnum(c) || c == '+' || c == '.' || c == '-';
+    });
+}
+
+// Splits the leading "scheme:" off `uri`.
+//
+// On success returns { lower-cased scheme, Error::None, text after the ':' }.
+// When `uri` has no ':' or the text before it fails valid_scheme, returns
+// { "", Error::InvalidScheme, the unchanged input }.
+std::tuple<std::string, uri::Error, uri::string_view_type> parse_scheme(uri::string_arg_type uri) {
+    const auto colon = uri.find(':');
+    if (colon == uri::npos || !::valid_scheme(uri.substr(0, colon))) {
+        return { "", uri::Error::InvalidScheme, uri };
+    }
+
+    // Lower-case the scheme character by character.
+    std::string lowered{ uri.substr(0, colon) };
+    for (char& ch : lowered) {
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    }
+
+    return { lowered, uri::Error::None, uri.substr(colon + 1) };
+}
+
+// Parses the optional "//[userinfo@]host[:port]" authority at the start of
+// `uri` (the input is the text that follows "scheme:").
+//
+// Returns { authority, error, remainder }:
+//  - Input not starting with "//": an empty Authority, Error::None and the
+//    unchanged input.
+//  - Otherwise the authority runs up to the first '/', '?' or '#', or to the
+//    end of the input when none of them occurs. The remainder starts at that
+//    delimiter and is empty when the authority reaches the end of the input.
+//  - A port that contains a non-digit or exceeds 65535 yields
+//    Error::InvalidPort. An empty port ("host:") is accepted and leaves
+//    Authority::port at 0.
+std::tuple<uri::Authority, uri::Error, uri::string_view_type> parse_authority(uri::string_arg_type uri) {
+    uri::Authority authority;
+
+    const bool has_authority = uri.length() >= 2 && uri[0] == '/' && uri[1] == '/';
+    if (!has_authority) {
+        return { authority, uri::Error::None, uri };
+    }
+
+    // Find where the authority ends. npos must be handled explicitly: adding
+    // an offset to it wraps around and would hand back part of the authority
+    // as the remaining input.
+    const auto auth_end = uri.find_first_of("/?#", 2);
+    uri::string_view_type host_port;
+    uri::string_view_type rem;
+    if (auth_end == uri::npos) {
+        host_port = uri.substr(2);
+    } else {
+        host_port = uri.substr(2, auth_end - 2);
+        rem = uri.substr(auth_end);
+    }
+    authority.authority = std::string(host_port);
+
+    const auto at = host_port.find('@');
+    if (at != uri::npos) {
+        authority.userinfo = std::string(host_port.substr(0, at));
+        host_port = host_port.substr(at + 1);
+    }
+
+    // Locate the ':' that introduces the port. A bracketed IP literal such as
+    // "[::1]" contains ':' itself, so for it only a ':' directly after the
+    // closing ']' counts.
+    auto port_sep = uri::npos;
+    if (!host_port.empty() && host_port[0] == '[') {
+        const auto close = host_port.find(']');
+        if (close != uri::npos && close + 1 < host_port.size() && host_port[close + 1] == ':') {
+            port_sep = close + 1;
+        }
+    } else {
+        port_sep = host_port.find(':');
+    }
+
+    if (port_sep != uri::npos) {
+        // Parse the digits by hand: the input is not guaranteed to be
+        // NUL-terminated, so C string conversion routines could read past it.
+        long port = 0;
+        for (auto i = port_sep + 1; i < host_port.size(); ++i) {
+            const char c = host_port[i];
+            if (c < '0' || c > '9') {
+                return { authority, uri::Error::InvalidPort, host_port };
+            }
+            port = port * 10 + (c - '0');
+            if (port > 65535) {  // Also keeps the accumulator from overflowing.
+                return { authority, uri::Error::InvalidPort, host_port };
+            }
+        }
+        authority.port = port;
+    }
+
+    // substr(0, npos) keeps the whole text when there is no port separator.
+    authority.host = std::string(host_port.substr(0, port_sep));
+
+    return { authority, uri::Error::None, rem };
+}
+
+// Splits the path off the front of `uri` (the text that follows the
+// authority).
+//
+// Returns { path, Error::None, remainder }. The path is everything before the
+// first '?' or '#'. When neither occurs the whole input is the path and the
+// remainder is empty.
+std::tuple<std::string, uri::Error, uri::string_view_type> parse_path(uri::string_arg_type uri) {
+    const auto pos = uri.find_first_of("#?");
+    if (pos == uri::npos) {
+        return { std::string(uri), uri::Error::None, "" };
+    }
+
+    // A '?' delimiter is consumed, but a '#' is left in the remainder.
+    // parse_query treats everything before the first '#' as query text, so
+    // dropping the '#' would turn a fragment that directly follows the path
+    // into a bogus query parameter.
+    const auto rest = uri[pos] == '?' ? pos + 1 : pos;
+    return { std::string(uri.substr(0, pos)), uri::Error::None, uri.substr(rest) };
+}
+
+// Parses the query part at the front of `uri` (the text that follows the
+// path).
+//
+// Returns { parameters, query string, Error::None, remainder }:
+//  - The query string is everything before the first '#', or the whole input
+//    when there is no '#'.
+//  - It is split into arguments at '&', ';' and '?'. Each argument becomes
+//    name -> value, split at its first '='; an argument without '=' maps to
+//    "". A later occurrence of a name overwrites an earlier one.
+//  - The remainder is the text after the '#', and is empty when there is no
+//    '#'.
+std::tuple<uri::query_type, std::string, uri::Error, uri::string_view_type> parse_query(uri::string_arg_type uri) {
+    const auto hash_pos = uri.find('#');
+    uri::string_view_type pending = uri.substr(0, hash_pos);
+    std::string query_string(pending);
+
+    uri::query_type query;
+    while (!pending.empty()) {
+        const auto delim_pos = pending.find_first_of("&;?");
+        const auto arg = pending.substr(0, delim_pos);
+        const auto equals_pos = arg.find('=');
+        if (equals_pos == uri::npos) {
+            query[std::string(arg)] = "";
+        } else {
+            query[std::string(arg.substr(0, equals_pos))] = std::string(arg.substr(equals_pos + 1));
+        }
+        if (delim_pos == uri::npos) {
+            pending = uri::string_view_type();
+        } else {
+            pending = pending.substr(delim_pos + 1);
+        }
+    }
+
+    // Without a '#' there is nothing left over. hash_pos + 1 would wrap
+    // around to 0 and return the whole query as the remainder, so that case
+    // is handled separately.
+    uri::string_view_type rem;
+    if (hash_pos != uri::npos) {
+        rem = uri.substr(hash_pos + 1);
+    }
+
+    return { query, query_string, uri::Error::None, rem };
+}
+
+// Final parse step: takes all of `uri` as the fragment.
+//
+// Returns { fragment text, Error::None, the unchanged input }.
+std::tuple<std::string, uri::Error, uri::string_view_type> parse_fragment(uri::string_arg_type uri) {
+    std::string fragment(uri);
+    return { fragment, uri::Error::None, uri };
+}
+
+} // anon namespace
+
+namespace uri {
+
+// Parses `uri_in` into its components.
+//
+// The scheme, authority, path, query and fragment steps run in that order,
+// each on the text the previous step left over. As soon as a step reports an
+// error, the result is Uri(error) with every other member left at its
+// default. Otherwise a fully populated Uri with error == Error::None is
+// returned.
+inline Uri parse_uri(uri::string_arg_type uri_in) {
+    Error status;
+    string_view_type rest;
+
+    // Step 1: scheme.
+    std::string scheme;
+    std::tie(scheme, status, rest) = ::parse_scheme(uri_in);
+    if (status != Error::None) {
+        return Uri(status);
+    }
+
+    // Step 2: authority.
+    Authority authority;
+    std::tie(authority, status, rest) = ::parse_authority(rest);
+    if (status != Error::None) {
+        return Uri(status);
+    }
+
+    // Step 3: path.
+    std::string path;
+    std::tie(path, status, rest) = ::parse_path(rest);
+    if (status != Error::None) {
+        return Uri(status);
+    }
+
+    // Step 4: query, both as a parameter map and as raw text.
+    query_type query;
+    std::string query_string;
+    std::tie(query, query_string, status, rest) = ::parse_query(rest);
+    if (status != Error::None) {
+        return Uri(status);
+    }
+
+    // Step 5: fragment.
+    std::string fragment;
+    std::tie(fragment, status, rest) = ::parse_fragment(rest);
+    if (status != Error::None) {
+        return Uri(status);
+    }
+
+    return Uri(scheme, authority, path, query, query_string, fragment);
+}
+
+} // namespace uri
+
+#endif // SIMPLE_URI_PARSER_LIBRARY_H
diff --git a/licenses/LICENSE-simple-uri-parser 
b/licenses/LICENSE-simple-uri-parser
new file mode 100644
index 00000000..5c6b9f0b
--- /dev/null
+++ b/licenses/LICENSE-simple-uri-parser
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Jonathan Hollocombe
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to