This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 25d124e85dc branch-4.0: [feat](tvf) support huggingface with http tvf #58049 #57242 (#58527)
25d124e85dc is described below
commit 25d124e85dcd00af10cf55f52239c15e9c7f93e5
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Tue Dec 2 21:06:50 2025 +0800
branch-4.0: [feat](tvf) support huggingface with http tvf #58049 #57242 (#58527)
bp #58049 #57242
---------
Co-authored-by: 神技圈子 <[email protected]>
---
be/src/http/http_client.h | 7 +
be/src/io/file_factory.cpp | 17 +
be/src/io/fs/file_system.h | 10 +-
be/src/io/fs/http_file_reader.cpp | 422 ++++++++++++
be/src/io/fs/http_file_reader.h | 87 +++
be/src/io/fs/http_file_system.cpp | 88 +++
be/src/io/fs/http_file_system.h | 104 +++
be/src/vec/exec/format/json/new_json_reader.cpp | 1 +
.../org/apache/doris/analysis/StorageBackend.java | 3 +-
.../doris/catalog/BuiltinTableValuedFunctions.java | 4 +-
.../apache/doris/datasource/FileQueryScanNode.java | 3 +-
.../property/constants/MCProperties.java | 2 -
.../property/storage/HttpProperties.java | 92 +++
.../property/storage/StorageProperties.java | 9 +-
.../java/org/apache/doris/fs/FileSystemType.java | 3 +-
.../java/org/apache/doris/fs/SchemaTypeMapper.java | 5 +-
.../doris/httpv2/rest/manager/HttpUtils.java | 69 ++
.../trees/expressions/functions/table/Http.java | 57 ++
.../visitor/TableValuedFunctionVisitor.java | 5 +
.../tablefunction/FileTableValuedFunction.java | 3 +
.../org/apache/doris/tablefunction/HFUtils.java | 755 +++++++++++++++++++++
.../tablefunction/HttpTableValuedFunction.java | 106 +++
.../doris/tablefunction/TableValuedFunctionIf.java | 2 +
.../apache/doris/tablefunction/HFUtilsTest.java | 416 ++++++++++++
gensrc/thrift/Types.thrift | 1 +
.../data/external_table_p0/tvf/test_http_tvf.out | 176 +++++
.../external_table_p0/tvf/test_http_tvf.groovy | 248 +++++++
27 files changed, 2680 insertions(+), 15 deletions(-)
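For orientation before the diff: per the HFUtils Javadoc further down, an hf:// URI is rewritten to a plain HTTPS URL of the form {endpoint}/{repo_type}/{repository}/resolve/{revision}{path}, which the new http TVF then reads. A minimal sketch of that mapping for the simple default-revision case, assuming no @revision suffix (the class and method names here are illustrative, not part of the patch):

    // Illustrative sketch only; mirrors the mapping documented in HFUtils below.
    public class HfUrlSketch {
        private static final String ENDPOINT = "https://huggingface.co";

        static String toHttpUrl(String hfUrl) {
            // hf://{repoType}/{user}/{repo}/{path}, no @revision handling here
            String rest = hfUrl.substring("hf://".length());
            String[] parts = rest.split("/", 4); // repoType, user, repo, path
            return ENDPOINT + "/" + parts[0] + "/" + parts[1] + "/" + parts[2]
                    + "/resolve/main/" + parts[3];
        }

        public static void main(String[] args) {
            // Per the HFUtils Javadoc, this prints:
            // https://huggingface.co/datasets/lhoestq/demo1/resolve/main/default/train/0000.parquet
            System.out.println(toHttpUrl("hf://datasets/lhoestq/demo1/default/train/0000.parquet"));
        }
    }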
diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h
index 7447ff9a751..b6c5a5a07d0 100644
--- a/be/src/http/http_client.h
+++ b/be/src/http/http_client.h
@@ -181,6 +181,13 @@ public:
// https://datatracker.ietf.org/doc/html/rfc3986
Status _escape_url(const std::string& url, std::string* escaped_url);
+ void set_range(size_t offset, size_t length) {
+ std::string range_header = "Range: bytes=" + std::to_string(offset) + "-" +
+ std::to_string(offset + length - 1);
+ _header_list = curl_slist_append(_header_list, range_header.c_str());
+ curl_easy_setopt(_curl, CURLOPT_HTTPHEADER, _header_list);
+ }
+
private:
const char* _to_errmsg(CURLcode code) const;
const char* _get_url() const;
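The Range header built by set_range above follows RFC 7233: byte ranges are inclusive at both ends, so a request for length bytes starting at offset ends at offset + length - 1. The same arithmetic as a hedged standalone Java sketch:

    // Sketch of the header math in HttpClient::set_range (illustrative only).
    static String rangeHeader(long offset, long length) {
        // Inclusive range: length bytes starting at offset end at offset + length - 1.
        return "Range: bytes=" + offset + "-" + (offset + length - 1);
    }
    // rangeHeader(0, 1)      -> "Range: bytes=0-0"   (the one-byte probe used below)
    // rangeHeader(100, 1024) -> "Range: bytes=100-1123"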
diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp
index e7718399487..bd08bc20461 100644
--- a/be/src/io/file_factory.cpp
+++ b/be/src/io/file_factory.cpp
@@ -27,6 +27,7 @@
#include "common/cast_set.h"
#include "common/config.h"
#include "common/status.h"
+#include "fs/http_file_reader.h"
#include "io/fs/broker_file_system.h"
#include "io/fs/broker_file_writer.h"
#include "io/fs/file_reader.h"
@@ -35,6 +36,7 @@
#include "io/fs/hdfs_file_reader.h"
#include "io/fs/hdfs_file_system.h"
#include "io/fs/hdfs_file_writer.h"
+#include "io/fs/http_file_system.h"
#include "io/fs/local_file_system.h"
#include "io/fs/multi_table_pipe.h"
#include "io/fs/s3_file_reader.h"
@@ -122,6 +124,14 @@ Result<io::FileSystemSPtr> FileFactory::create_fs(const io::FSPropertiesRef& fs_
return io::HdfsFileSystem::create(*fs_properties.properties, fs_name,
io::FileSystem::TMP_FS_ID, nullptr);
}
+ case TFileType::FILE_HTTP: {
+ const auto& kv = *fs_properties.properties;
+ auto it = kv.find("uri");
+ if (it == kv.end() || it->second.empty()) {
+ return ResultError(Status::InternalError("http fs must set uri
property"));
+ }
+ return io::HttpFileSystem::create(it->second, io::FileSystem::TMP_FS_ID, kv);
+ }
default:
return ResultError(Status::InternalError("unsupported fs type: {}",
std::to_string(fs_properties.type)));
@@ -248,6 +258,13 @@ Result<io::FileReaderSPtr> FileFactory::create_file_reader(
return file_reader;
});
}
+ case TFileType::FILE_HTTP: {
+ return io::HttpFileReader::create(file_description.path, system_properties.properties,
+ reader_options, profile)
+ .and_then([&](auto&& reader) {
+ return io::create_cached_file_reader(std::move(reader), reader_options);
+ });
+ }
default:
return ResultError(
Status::InternalError("unsupported file reader type: {}",
std::to_string(type)));
diff --git a/be/src/io/fs/file_system.h b/be/src/io/fs/file_system.h
index 6baf07917d3..2cf63b2ff00 100644
--- a/be/src/io/fs/file_system.h
+++ b/be/src/io/fs/file_system.h
@@ -48,12 +48,7 @@ namespace doris::io {
} while (0);
#endif
-enum class FileSystemType : uint8_t {
- LOCAL,
- S3,
- HDFS,
- BROKER,
-};
+enum class FileSystemType : uint8_t { LOCAL, S3, HDFS, BROKER, HTTP };
inline std::ostream& operator<<(std::ostream& ostr, FileSystemType type) {
switch (type) {
@@ -69,6 +64,9 @@ inline std::ostream& operator<<(std::ostream& ostr, FileSystemType type) {
case FileSystemType::BROKER:
ostr << "BROKER";
return ostr;
+ case FileSystemType::HTTP:
+ ostr << "HTTP";
+ return ostr;
default:
ostr << "UNKNOWN";
return ostr;
diff --git a/be/src/io/fs/http_file_reader.cpp b/be/src/io/fs/http_file_reader.cpp
new file mode 100644
index 00000000000..fb243179baf
--- /dev/null
+++ b/be/src/io/fs/http_file_reader.cpp
@@ -0,0 +1,422 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "io/fs/http_file_reader.h"
+
+#include <curl/curl.h>
+#include <curl/easy.h>
+
+#include <algorithm>
+
+#include "common/logging.h"
+
+namespace doris::io {
+
+Result<FileReaderSPtr> HttpFileReader::create(const std::string& url,
+ const std::map<std::string, std::string>& props,
+ const FileReaderOptions& opts,
+ RuntimeProfile* /*profile*/) {
+ OpenFileInfo ofi;
+ ofi.path = Path(url);
+ ofi.extend_info = props;
+
+ auto reader = std::make_shared<HttpFileReader>(ofi, url);
+
+ // Open the file to detect Range support and validate configuration
+ RETURN_IF_ERROR_RESULT(reader->open(opts));
+
+ return reader;
+}
+
+HttpFileReader::HttpFileReader(const OpenFileInfo& fileInfo, std::string url)
+ : _extend_kv(fileInfo.extend_info),
+ _path(fileInfo.path),
+ _url(std::move(url)),
+ _client(std::make_unique<HttpClient>()) {
+ auto etag_iter = _extend_kv.find("etag");
+ if (etag_iter != _extend_kv.end()) {
+ _etag = etag_iter->second;
+ }
+
+ auto lm_iter = _extend_kv.find("last_modified");
+ if (lm_iter != _extend_kv.end()) {
+ _last_modified = std::stoll(lm_iter->second);
+ }
+
+ auto size_iter = _extend_kv.find("file_size");
+ if (size_iter != _extend_kv.end()) {
+ _file_size = std::stoull(size_iter->second);
+ _initialized = true;
+ }
+
+ // Parse configuration for non-Range request handling
+ auto enable_range_iter = _extend_kv.find("http.enable.range.request");
+ if (enable_range_iter != _extend_kv.end()) {
+ // Convert to lowercase for case-insensitive comparison
+ std::string value = enable_range_iter->second;
+ std::transform(value.begin(), value.end(), value.begin(), ::tolower);
+ _enable_range_request = (value != "false" && value != "0");
+ }
+
+ auto max_size_iter = _extend_kv.find("http.max.request.size.bytes");
+ if (max_size_iter != _extend_kv.end()) {
+ try {
+ _max_request_size_bytes = std::stoull(max_size_iter->second);
+ } catch (const std::exception& _) {
+ LOG(WARNING) << "Invalid http.max.request.size.bytes value: " <<
max_size_iter->second
+ << ", using default: " << DEFAULT_MAX_REQUEST_SIZE;
+ _max_request_size_bytes = DEFAULT_MAX_REQUEST_SIZE;
+ }
+ }
+
+ _read_buffer = std::make_unique<char[]>(READ_BUFFER_SIZE);
+}
+
+HttpFileReader::~HttpFileReader() {
+ static_cast<void>(close());
+}
+
+Status HttpFileReader::open(const FileReaderOptions& opts) {
+ if (_initialized) {
+ return Status::OK();
+ }
+
+ // Step 1: HEAD request to get file metadata
+ RETURN_IF_ERROR(prepare_client(/*set_fail_on_error=*/true));
+ _client->set_method(HttpMethod::HEAD);
+ RETURN_IF_ERROR(_client->execute());
+
+ uint64_t content_length = 0;
+ RETURN_IF_ERROR(_client->get_content_length(&content_length));
+
+ _file_size = content_length;
+ _size_known = true;
+
+ // Step 2: Check if Range request is disabled by configuration
+ if (!_enable_range_request) {
+ // User explicitly disabled Range requests, use non-Range mode directly
+ _range_supported = false;
+ LOG(INFO) << "Range requests disabled by configuration for " << _url
+ << ", using non-Range mode. File size: " << _file_size << "
bytes";
+
+ // Check if file size exceeds limit for non-Range mode
+ if (_file_size > _max_request_size_bytes) {
+ return Status::InternalError(
+ "Non-Range mode: file size ({} bytes) exceeds maximum
allowed size ({} bytes, "
+ "configured by http.max.request.size.bytes). URL: {}",
+ _file_size, _max_request_size_bytes, _url);
+ }
+
+ LOG(INFO) << "Non-Range mode validated for " << _url << ", file size:
" << _file_size
+ << " bytes, max allowed: " << _max_request_size_bytes << "
bytes";
+ } else {
+ // Step 3: Range request is enabled (default), detect Range support
+ VLOG(1) << "Detecting Range support for URL: " << _url;
+ RETURN_IF_ERROR(detect_range_support());
+
+ // Step 4: Validate Range support detection result
+ if (!_range_supported) {
+ // Server does not support Range and Range is required
+ return Status::NotSupported(
+ "HTTP server does not support Range requests (RFC 7233),
which is required "
+ "for reading files. File size: {} bytes, URL: {}. "
+ "To allow reading without Range support, set "
+ "'http.enable.range.request'='false' "
+ "in properties and configure 'http.max.request.size.bytes'
appropriately "
+ "(note: this may cause high memory usage for large
files).",
+ _file_size, _url);
+ }
+
+ LOG(INFO) << "HTTP server supports Range requests for " << _url;
+ }
+
+ _initialized = true;
+ return Status::OK();
+}
+
+Status HttpFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read,
+ const IOContext* /*io_ctx*/) {
+ VLOG(2) << "HttpFileReader::read_at_impl offset=" << offset << " size=" <<
result.size
+ << " url=" << _url << " range_supported=" << _range_supported;
+
+ if (!_read_buffer) {
+ _read_buffer = std::make_unique<char[]>(READ_BUFFER_SIZE);
+ }
+
+ size_t to_read = result.size;
+ size_t buffer_offset = 0;
+
+ if (_size_known && offset >= _file_size) {
+ *bytes_read = 0;
+ return Status::OK();
+ }
+
+ // Try to serve from buffer cache
+ if (offset >= _buffer_start && offset < _buffer_end) {
+ size_t buffer_idx = offset - _buffer_start;
+ size_t available = _buffer_end - offset;
+ size_t copy_len = std::min(available, to_read);
+
+ DCHECK(buffer_idx + copy_len <= READ_BUFFER_SIZE)
+ << "Buffer overflow: buffer_idx=" << buffer_idx << "
copy_len=" << copy_len
+ << " READ_BUFFER_SIZE=" << READ_BUFFER_SIZE;
+
+ std::memcpy(result.data, _read_buffer.get() + buffer_idx, copy_len);
+ buffer_offset += copy_len;
+ to_read -= copy_len;
+ offset += copy_len;
+
+ VLOG(2) << "Buffer cache hit: copied " << copy_len << " bytes";
+ } else {
+ // Buffer miss, invalidate cache
+ _buffer_start = 0;
+ _buffer_end = 0;
+ VLOG(2) << "Buffer cache miss";
+ }
+
+ if (to_read == 0) {
+ *bytes_read = buffer_offset;
+ return Status::OK();
+ }
+
+ size_t remaining = to_read;
+ if (_size_known) {
+ uint64_t left = (_file_size > offset) ? (_file_size - offset) : 0;
+ if (left == 0) {
+ *bytes_read = buffer_offset;
+ return Status::OK();
+ }
+ remaining = std::min<uint64_t>(to_read, left);
+ }
+ size_t req_len = (remaining > READ_BUFFER_SIZE) ? remaining : READ_BUFFER_SIZE;
+
+ VLOG(2) << "Issuing HTTP GET request: offset=" << offset << " req_len=" <<
req_len
+ << " with_range=" << _range_supported;
+
+ // Prepare and initialize the HTTP client for GET request
+ RETURN_IF_ERROR(prepare_client(/*set_fail_on_error=*/false));
+ _client->set_method(HttpMethod::GET);
+
+ _client->set_header("Expect", "");
+ _client->set_header("Connection", "close");
+
+ bool with_range = _range_supported;
+ if (with_range) _client->set_range(offset, req_len);
+
+ std::string buf;
+ buf.reserve(req_len);
+ size_t total_received = 0;
+ bool size_limit_exceeded = false;
+
+ auto cb = [&](const void* data, size_t len) {
+ total_received += len;
+
+ // If using non-Range mode, enforce size limit to prevent OOM
+ if (!_range_supported && total_received > _max_request_size_bytes) {
+ size_limit_exceeded = true;
+ VLOG(1) << "Stopping download: received " << total_received << "
bytes, exceeds limit "
+ << _max_request_size_bytes;
+ return false; // Stop receiving - this will cause CURL to return an error
+ }
+
+ buf.append(reinterpret_cast<const char*>(data), len);
+ return true;
+ };
+
+ Status exec_status = _client->execute(cb);
+
+ // Check if we stopped due to size limit - this is expected behavior
+ if (size_limit_exceeded) {
+ return Status::InternalError(
+ "HTTP response too large: received {} bytes, exceeds maximum
allowed size {} "
+ "bytes (configured by max.request.size.bytes). URL: {}",
+ total_received, _max_request_size_bytes, _url);
+ }
+
+ // If there's an error and it's not due to our size limit check, return it
+ RETURN_IF_ERROR(exec_status);
+
+ long http_status = _client->get_http_status();
+ VLOG(2) << "HTTP response: status=" << http_status << " received_bytes="
<< buf.size();
+
+ if (buf.empty()) {
+ *bytes_read = buffer_offset;
+ return Status::OK();
+ }
+
+ // Defensive check: if we sent Range but server returned 200 instead of 206
+ // This should rarely happen since we detect Range support in open()
+ if (with_range && offset > 0 && http_status == 200) {
+ LOG(ERROR) << "HTTP server unexpectedly does not support Range
requests for " << _url
+ << " (this should have been detected in open()). HTTP
status: " << http_status
+ << ", received: " << buf.size()
+ << " bytes. This indicates a server behavior change.";
+
+ return Status::InternalError(
+ "HTTP server does not support Range requests but this was not
detected during "
+ "file open. This may indicate the server behavior has changed.
"
+ "HTTP status: {}, received: {} bytes. URL: {}",
+ http_status, buf.size(), _url);
+ }
+
+ // Handle non-Range mode: when _range_supported is false, we download full file
+ if (!_range_supported && offset > 0) {
+ // We're in non-Range mode and need data from middle of file
+ // The full file should have been downloaded
+ if (offset >= buf.size()) {
+ *bytes_read = buffer_offset;
+ return Status::OK();
+ }
+
+ size_t slice_len = std::min<size_t>(remaining, buf.size() - offset);
+ std::memcpy(result.data + buffer_offset, buf.data() + offset, slice_len);
+ buffer_offset += slice_len;
+
+ size_t cached = std::min(slice_len, (size_t)READ_BUFFER_SIZE);
+ std::memcpy(_read_buffer.get(), buf.data() + offset, cached);
+ _buffer_start = offset;
+ _buffer_end = offset + cached;
+
+ *bytes_read = buffer_offset;
+ return Status::OK();
+ }
+
+ if (to_read > READ_BUFFER_SIZE) {
+ if (buf.size() > remaining) {
+ return Status::InternalError("HTTP response larger than requested
buffer");
+ }
+ std::memcpy(result.data + buffer_offset, buf.data(), buf.size());
+ buffer_offset += buf.size();
+ } else {
+ size_t cached = std::min(buf.size(), (size_t)READ_BUFFER_SIZE);
+ std::memcpy(_read_buffer.get(), buf.data(), cached);
+ _buffer_start = offset;
+ _buffer_end = offset + cached;
+
+ size_t copy_len = std::min(remaining, cached);
+ std::memcpy(result.data + buffer_offset, _read_buffer.get(), copy_len);
+ buffer_offset += copy_len;
+ }
+
+ if (!_size_known && with_range && buf.size() < req_len) {
+ _size_known = true;
+ _file_size = offset + buf.size();
+ }
+
+ *bytes_read = buffer_offset;
+ return Status::OK();
+}
+
+Status HttpFileReader::close() {
+ if (_closed.exchange(true)) {
+ return Status::OK();
+ }
+
+ // Release buffer memory (1MB)
+ _read_buffer.reset();
+ _buffer_start = 0;
+ _buffer_end = 0;
+
+ // Release HttpClient resources
+ _client.reset();
+
+ return Status::OK();
+}
+
+Status HttpFileReader::prepare_client(bool set_fail_on_error) {
+ if (!_client) {
+ return Status::InternalError("HttpClient is not initialized");
+ }
+
+ // Initialize the HTTP client with URL
+ RETURN_IF_ERROR(_client->init(_url, set_fail_on_error));
+
+ // Set custom headers from extend_kv
+ for (const auto& kv : _extend_kv) {
+ if (kv.first.rfind("http.header.", 0) == 0) {
+ _client->set_header(kv.first.substr(strlen("http.header.")),
kv.second);
+ }
+ }
+
+ return Status::OK();
+}
+
+Status HttpFileReader::detect_range_support() {
+ // Send a small Range request to test if the server supports it
+ // We request only the first byte to minimize data transfer
+ RETURN_IF_ERROR(prepare_client(/*set_fail_on_error=*/false));
+ _client->set_method(HttpMethod::GET);
+ _client->set_range(0, 1); // Request only the first byte
+
+ std::string test_buf;
+ size_t received = 0;
+ constexpr size_t MAX_TEST_SIZE = 10240; // 10KB max for test
+ bool stopped_by_limit = false;
+
+ auto cb = [&](const void* data, size_t len) {
+ received += len;
+ // Limit test data to prevent downloading too much
+ if (received > MAX_TEST_SIZE) {
+ stopped_by_limit = true;
+ VLOG(2) << "Stopping Range detection test after receiving " <<
received << " bytes";
+ return false; // This will cause CURL to stop with an error
+ }
+ test_buf.append(reinterpret_cast<const char*>(data), len);
+ return true;
+ };
+
+ Status exec_status = _client->execute(cb);
+
+ // If we stopped because of size limit, it's not a real error
+ if (!exec_status.ok() && stopped_by_limit) {
+ VLOG(1) << "Range detection stopped at size limit (expected): " <<
exec_status.to_string();
+ // Continue processing - this is expected behavior
+ } else if (!exec_status.ok()) {
+ // Real error
+ return exec_status;
+ }
+
+ long http_status = _client->get_http_status();
+
+ if (http_status == 206) {
+ // HTTP 206 Partial Content - server supports Range requests
+ _range_supported = true;
+ VLOG(1) << "Range support detected (HTTP 206) for " << _url << ",
received "
+ << test_buf.size() << " bytes";
+ } else if (http_status == 200) {
+ // HTTP 200 OK - server does not support Range requests
+ // It returned the full file (or a large portion)
+ _range_supported = false;
+ VLOG(1) << "Range not supported (HTTP 200) for " << _url << ",
received " << test_buf.size()
+ << " bytes in test";
+
+ // If we received a lot of data, it's likely the full file
+ if (test_buf.size() >= MAX_TEST_SIZE || stopped_by_limit) {
+ LOG(WARNING) << "Server returned " << received << "+ bytes for
Range test, "
+ << "indicating no Range support for " << _url;
+ }
+ } else {
+ // Unexpected status code
+ LOG(WARNING) << "Unexpected HTTP status " << http_status << " during
Range detection for "
+ << _url << ", assuming Range is not supported";
+ _range_supported = false;
+ }
+
+ return Status::OK();
+}
+
+} // namespace doris::io
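detect_range_support above classifies the server by the status code of a bytes=0-0 probe: 206 means the Range header was honored, 200 means it was ignored and the full body was sent, and anything else is treated conservatively. The decision restated as an illustrative Java sketch:

    // Restatement of the status handling in detect_range_support (sketch only).
    static boolean rangeSupported(int httpStatus) {
        switch (httpStatus) {
            case 206: return true;  // Partial Content: Range was honored
            case 200: return false; // OK: Range ignored, full body returned
            default:  return false; // unexpected status: assume no support
        }
    }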
diff --git a/be/src/io/fs/http_file_reader.h b/be/src/io/fs/http_file_reader.h
new file mode 100644
index 00000000000..607eedf3d1a
--- /dev/null
+++ b/be/src/io/fs/http_file_reader.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "common/status.h"
+#include "http/http_client.h"
+#include "io/fs/file_handle_cache.h"
+#include "io/fs/file_reader.h"
+#include "io/fs/file_system.h"
+#include "util/runtime_profile.h"
+#include "util/slice.h"
+
+namespace doris::io {
+typedef struct OpenFileInfo {
+ Path path;
+ std::map<std::string, std::string> extend_info;
+} OpenFileInfo;
+class HttpFileReader final : public FileReader {
+public:
+ static Result<FileReaderSPtr> create(const std::string& url,
+ const std::map<std::string, std::string>& props,
+ const FileReaderOptions& opts, RuntimeProfile* profile);
+
+ explicit HttpFileReader(const OpenFileInfo& fileInfo, std::string url);
+ ~HttpFileReader() override;
+
+ Status open(const FileReaderOptions& opts);
+ Status read_at_impl(size_t offset, Slice result, size_t* bytes_read,
+ const IOContext* io_ctx = nullptr) override;
+ Status close() override;
+ const Path& path() const override { return _path; }
+ bool closed() const override { return _closed.load(std::memory_order_acquire); }
+ size_t size() const override { return _file_size; }
+
+private:
+ // Prepare and initialize the HTTP client for a new request
+ Status prepare_client(bool set_fail_on_error = true);
+
+ // Detect if the HTTP server supports Range requests
+ // Returns OK on success with _range_supported set appropriately
+ Status detect_range_support();
+
+ std::unique_ptr<char[]> _read_buffer;
+ static constexpr size_t READ_BUFFER_SIZE = 1 << 20; // 1MB
+ // Default maximum file size for servers that don't support Range requests
+ static constexpr size_t DEFAULT_MAX_REQUEST_SIZE = 100 << 20; // 100MB
+
+ size_t _buffer_start = 0;
+ size_t _buffer_end = 0;
+ bool _size_known = false;
+ bool _range_supported = true;
+ std::string _etag;
+ bool _initialized = false;
+ std::map<std::string, std::string> _extend_kv;
+ size_t _file_size = static_cast<size_t>(-1);
+ Path _path;
+ std::string _url;
+ int64_t _last_modified = 0;
+ std::atomic<bool> _closed = false;
+ std::unique_ptr<HttpClient> _client;
+
+ // Configuration for non-Range request handling
+ bool _enable_range_request = true; // Whether Range requests are enabled
+ size_t _max_request_size_bytes = DEFAULT_MAX_REQUEST_SIZE; // Max size for non-Range downloads
+};
+
+} // namespace doris::io
diff --git a/be/src/io/fs/http_file_system.cpp b/be/src/io/fs/http_file_system.cpp
new file mode 100644
index 00000000000..92e175ca774
--- /dev/null
+++ b/be/src/io/fs/http_file_system.cpp
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "io/fs/http_file_system.h"
+
+#include <fstream>
+
+#include "common/status.h"
+#include "http/http_status.h"
+#include "io/fs/err_utils.h"
+#include "io/fs/file_system.h"
+#include "io/fs/file_writer.h"
+#include "io/fs/http_file_reader.h"
+
+namespace doris::io {
+HttpFileSystem::HttpFileSystem(Path&& root_path, std::string id,
+ std::map<std::string, std::string> properties)
+ : RemoteFileSystem(std::move(root_path), std::move(id), FileSystemType::HTTP),
+ _properties(std::move(properties)) {}
+
+Status HttpFileSystem::_init(const std::string& url) {
+ _url = url;
+ return Status::OK();
+}
+
+Result<std::shared_ptr<HttpFileSystem>> HttpFileSystem::create(
+ std::string id, const std::string& url,
+ const std::map<std::string, std::string>& properties) {
+ Path root_path = "";
+ std::shared_ptr<HttpFileSystem> fs(
+ new HttpFileSystem(std::move(root_path), std::move(id), properties));
+
+ RETURN_IF_ERROR_RESULT(fs->_init(url));
+
+ return fs;
+}
+
+Status HttpFileSystem::open_file_internal(const Path& path, FileReaderSPtr* reader,
+ const FileReaderOptions& opts) {
+ OpenFileInfo file_info;
+ file_info.path = path;
+ // Pass properties (including HTTP headers) to the file reader
+ file_info.extend_info = _properties;
+
+ auto http_reader = std::make_shared<HttpFileReader>(file_info, path.native());
+ RETURN_IF_ERROR(http_reader->open(opts));
+ *reader = http_reader;
+ return Status::OK();
+}
+
+Status HttpFileSystem::file_size_impl(const Path& file, int64_t* file_size) const {
+ FileReaderOptions opts;
+ FileReaderSPtr reader;
+ RETURN_IF_ERROR(const_cast<HttpFileSystem*>(this)->open_file(file, &reader, &opts));
+ *file_size = reader->size();
+ RETURN_IF_ERROR(reader->close());
+ return Status::OK();
+}
+
+Status HttpFileSystem::exists_impl(const Path& path, bool* res) const {
+ FileReaderSPtr reader;
+ auto st = const_cast<HttpFileSystem*>(this)->open_file(path, &reader);
+ if (st.ok()) {
+ *res = true;
+ return Status::OK();
+ } else if (st.code() == HttpStatus::NOT_FOUND) {
+ *res = false;
+ return Status::OK();
+ } else {
+ return st;
+ }
+}
+
+} // namespace doris::io
diff --git a/be/src/io/fs/http_file_system.h b/be/src/io/fs/http_file_system.h
new file mode 100644
index 00000000000..e221d3b915d
--- /dev/null
+++ b/be/src/io/fs/http_file_system.h
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <filesystem>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common/status.h"
+#include "http/http_client.h"
+#include "http_file_reader.h"
+#include "io/fs/file_writer.h"
+#include "io/fs/path.h"
+#include "io/fs/remote_file_system.h"
+
+namespace doris::io {
+class HttpFileSystem final : public RemoteFileSystem {
+public:
+ static Result<std::shared_ptr<HttpFileSystem>> create(
+ std::string id, const std::string& uri,
+ const std::map<std::string, std::string>& properties = {});
+ ~HttpFileSystem() override = default;
+
+protected:
+ Status file_size_impl(const Path& file, int64_t* file_size) const override;
+
+ Status exists_impl(const Path& path, bool* res) const override;
+
+ Status open_file_internal(const Path& file, FileReaderSPtr* reader,
+ const FileReaderOptions& opts) override;
+ Status download_impl(const Path& remote_file, const Path& local_file) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status batch_upload_impl(const std::vector<Path>& local_files,
+ const std::vector<Path>& remote_files) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status upload_impl(const Path& local_file, const Path& remote_file) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status open_file_impl(const Path& file, FileReaderSPtr* reader,
+ const FileReaderOptions* opts) override {
+ return Status::NotSupported("not suported");
+ }
+
+ Status create_directory_impl(const Path& dir, bool failed_if_exists = false) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status delete_file_impl(const Path& file) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status batch_delete_impl(const std::vector<Path>& files) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status delete_directory_impl(const Path& dir) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status create_file_impl(const Path& file, FileWriterPtr* writer,
+ const FileWriterOptions* opts) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status list_impl(const Path& dir, bool only_file, std::vector<FileInfo>* files,
+ bool* exists) override {
+ return Status::NotSupported("not supported");
+ }
+
+ Status rename_impl(const Path& orig_name, const Path& new_name) override {
+ return Status::NotSupported("not supported");
+ }
+
+private:
+ HttpFileSystem(Path&& root_path, std::string id, std::map<std::string, std::string> properties);
+ Status _init(const std::string& url);
+
+ std::string _url;
+ std::map<std::string, std::string> _properties;
+};
+} // namespace doris::io
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index ed3e83cceab..3874aa20d9c 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -491,6 +491,7 @@ Status NewJsonReader::_read_one_message(DorisUniqueBufferPtr<uint8_t>* file_buf,
case TFileType::FILE_LOCAL:
[[fallthrough]];
case TFileType::FILE_HDFS:
+ case TFileType::FILE_HTTP:
[[fallthrough]];
case TFileType::FILE_S3: {
size_t file_size = _file_reader->size();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
index a16ac12510f..b0fa82c409a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
@@ -132,7 +132,8 @@ public class StorageBackend implements ParseNode {
GFS("Tencent Goose File System"),
JFS("Juicefs"),
STREAM("Stream load pipe"),
- AZURE("MicroSoft Azure Blob");
+ AZURE("MicroSoft Azure Blob"),
+ HTTP("HTTP");
@SerializedName("desc")
private final String description;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java
index d4bdf92696b..b46ee271c59 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java
@@ -24,6 +24,7 @@ import org.apache.doris.nereids.trees.expressions.functions.table.Frontends;
import org.apache.doris.nereids.trees.expressions.functions.table.FrontendsDisks;
import org.apache.doris.nereids.trees.expressions.functions.table.GroupCommit;
import org.apache.doris.nereids.trees.expressions.functions.table.Hdfs;
+import org.apache.doris.nereids.trees.expressions.functions.table.Http;
import org.apache.doris.nereids.trees.expressions.functions.table.HttpStream;
import org.apache.doris.nereids.trees.expressions.functions.table.HudiMeta;
import org.apache.doris.nereids.trees.expressions.functions.table.IcebergMeta;
@@ -67,7 +68,8 @@ public class BuiltinTableValuedFunctions implements FunctionHelper {
tableValued(Tasks.class, "tasks"),
tableValued(Query.class, "query"),
tableValued(PartitionValues.class, "partition_values"),
- tableValued(File.class, "file")
+ tableValued(File.class, "file"),
+ tableValued(Http.class, "http")
);
public static final BuiltinTableValuedFunctions INSTANCE = new BuiltinTableValuedFunctions();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java
index fcb83feb49c..38e59bf9150 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java
@@ -451,7 +451,8 @@ public abstract class FileQueryScanNode extends FileScanNode {
}
}
}
- } else if ((locationType == TFileType.FILE_S3 || locationType == TFileType.FILE_LOCAL)
+ } else if ((locationType == TFileType.FILE_S3 || locationType == TFileType.FILE_LOCAL
+ || locationType == TFileType.FILE_HTTP)
&& !params.isSetProperties()) {
params.setProperties(locationProperties);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/constants/MCProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/constants/MCProperties.java
index bdfd9793c53..70feb48f2f7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/constants/MCProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/constants/MCProperties.java
@@ -43,13 +43,11 @@ public class MCProperties extends BaseProperties {
public static final String QUOTA = "mc.quota";
public static final String DEFAULT_QUOTA = "pay-as-you-go";
-
public static final String SPLIT_STRATEGY = "mc.split_strategy";
public static final String SPLIT_BY_BYTE_SIZE_STRATEGY = "byte_size";
public static final String SPLIT_BY_ROW_COUNT_STRATEGY = "row_count";
public static final String DEFAULT_SPLIT_STRATEGY = SPLIT_BY_BYTE_SIZE_STRATEGY;
-
public static final String SPLIT_BYTE_SIZE = "mc.split_byte_size";
public static final String DEFAULT_SPLIT_BYTE_SIZE = "268435456"; //256 *
1024L * 1024L = 256MB
public static final String SPLIT_ROW_COUNT = "mc.split_row_count";
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
new file mode 100644
index 00000000000..b6b9eaa63c6
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.datasource.property.storage;
+
+import org.apache.doris.common.UserException;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Maps;
+import org.apache.hudi.common.util.MapUtils;
+
+import java.util.Map;
+import java.util.Set;
+
+public class HttpProperties extends StorageProperties {
+ private static final ImmutableSet<String> HTTP_PROPERTIES = new ImmutableSet.Builder<String>()
+ .add(StorageProperties.FS_HTTP_SUPPORT)
+ .build();
+
+ public HttpProperties(Map<String, String> origProps) {
+ super(Type.HTTP, origProps);
+ }
+
+ @Override
+ public Map<String, String> getBackendConfigProperties() {
+ return origProps;
+ }
+
+ @Override
+ public String validateAndNormalizeUri(String url) throws UserException {
+ if (url == null || (!url.startsWith("http://") &&
!url.startsWith("https://") && !url.startsWith("hf://"))) {
+ throw new UserException("Invalid http/hf url: " + url);
+ }
+ return url;
+ }
+
+ @Override
+ public String validateAndGetUri(Map<String, String> props) throws UserException {
+ String url = props.get(URI_KEY);
+ return validateAndNormalizeUri(url);
+ }
+
+ public static boolean guessIsMe(Map<String, String> props) {
+ return !MapUtils.isNullOrEmpty(props)
+ && HTTP_PROPERTIES.stream().anyMatch(props::containsKey);
+ }
+
+ public String getUri() {
+ return origProps.get(URI_KEY);
+ }
+
+ @Override
+ public String getStorageName() {
+ return "http";
+ }
+
+ @Override
+ public void initializeHadoopStorageConfig() {
+ // not used
+ hadoopStorageConfig = null;
+ }
+
+ @Override
+ protected Set<String> schemas() {
+ return ImmutableSet.of("http");
+ }
+
+ public Map<String, String> getHeaders() {
+ Map<String, String> headers = Maps.newHashMap();
+ for (Map.Entry<String, String> entry : origProps.entrySet()) {
+ if (entry.getKey().toLowerCase().startsWith("http.header.")) {
+ String headerKey = entry.getKey().substring("http.header.".length());
+ headers.put(headerKey, entry.getValue());
+ }
+ }
+ return headers;
+ }
+}
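Both getHeaders above and the BE reader's prepare_client forward any property prefixed with http.header. as a literal request header. A hedged usage sketch, assuming the Doris FE classpath; the URL and token values are placeholders:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.doris.datasource.property.storage.HttpProperties;

    class HttpHeaderSketch {
        static Map<String, String> headersFor(String token) {
            Map<String, String> props = new HashMap<>();
            props.put("uri", "https://example.com/data.parquet");      // placeholder URL
            props.put("http.header.Authorization", "Bearer " + token); // placeholder credential
            // getHeaders() strips the "http.header." prefix, yielding
            // {"Authorization": "Bearer <token>"}.
            return new HttpProperties(props).getHeaders();
        }
    }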
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/StorageProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/StorageProperties.java
index cedfe8388b3..3e8a703204b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/StorageProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/StorageProperties.java
@@ -49,6 +49,8 @@ public abstract class StorageProperties extends ConnectionProperties {
public static final String FS_COS_SUPPORT = "fs.cos.support";
public static final String FS_OSS_HDFS_SUPPORT = "fs.oss-hdfs.support";
public static final String FS_LOCAL_SUPPORT = "fs.local.support";
+ public static final String FS_HTTP_SUPPORT = "fs.http.support";
+
public static final String DEPRECATED_OSS_HDFS_SUPPORT = "oss.hdfs.enabled";
protected static final String URI_KEY = "uri";
@@ -68,6 +70,7 @@ public abstract class StorageProperties extends ConnectionProperties {
AZURE,
BROKER,
LOCAL,
+ HTTP,
UNKNOWN
}
@@ -197,8 +200,10 @@ public abstract class StorageProperties extends ConnectionProperties {
props -> (isFsSupport(props, FS_BROKER_SUPPORT)
|| BrokerProperties.guessIsMe(props)) ? new BrokerProperties(props) : null,
props -> (isFsSupport(props, FS_LOCAL_SUPPORT)
- || LocalProperties.guessIsMe(props)) ? new LocalProperties(props) : null
- );
+ || LocalProperties.guessIsMe(props)) ? new LocalProperties(props) : null,
+ props -> (isFsSupport(props, FS_HTTP_SUPPORT)
+ || HttpProperties.guessIsMe(props)) ? new HttpProperties(props) : null
+ );
protected StorageProperties(Type type, Map<String, String> origProps) {
super(origProps);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemType.java b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemType.java
index ffd94619e61..1a24c71ae29 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemType.java
@@ -45,5 +45,6 @@ public enum FileSystemType {
JFS,
BROKER,
FILE,
- AZURE
+ AZURE,
+ HTTP
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java b/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
index a89e8be1ac9..0686f977d4d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
@@ -77,7 +77,9 @@ public enum SchemaTypeMapper {
WASB("wasb", StorageProperties.Type.AZURE, FileSystemType.S3,
TFileType.FILE_S3),
WASBS("wasbs", StorageProperties.Type.AZURE, FileSystemType.S3,
TFileType.FILE_S3),
HDFS("hdfs", StorageProperties.Type.HDFS, FileSystemType.HDFS,
TFileType.FILE_HDFS),
- LOCAL("local", StorageProperties.Type.HDFS, FileSystemType.HDFS,
TFileType.FILE_HDFS);
+ LOCAL("local", StorageProperties.Type.HDFS, FileSystemType.HDFS,
TFileType.FILE_HDFS),
+ HTTP("http", StorageProperties.Type.HTTP, FileSystemType.HTTP,
TFileType.FILE_HTTP),
+ HTTPS("https", StorageProperties.Type.HTTP, FileSystemType.HTTP,
TFileType.FILE_HTTP);
//LAKEFS("lakefs", StorageProperties.Type.LAKEFS),
//GCS("gs", StorageProperties.Type.S3),
//BOS("bos", StorageProperties.Type.BOS),
@@ -156,3 +158,4 @@ public enum SchemaTypeMapper {
return SCHEMA_TO_FILE_TYPE_MAP.get(schema.toLowerCase());
}
}
+
diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
index 0c362279ab1..0dffb612217 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
@@ -20,6 +20,7 @@ package org.apache.doris.httpv2.rest.manager;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.Util;
import org.apache.doris.httpv2.entity.ResponseBody;
import org.apache.doris.persist.gson.GsonUtils;
import org.apache.doris.system.Frontend;
@@ -37,8 +38,12 @@ import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
@@ -49,6 +54,8 @@ import java.util.stream.Collectors;
* used to forward http requests from manager to be.
*/
public class HttpUtils {
+ private static final Logger LOG = LogManager.getLogger(HttpUtils.class);
+
static final int REQUEST_SUCCESS_CODE = 0;
static final int DEFAULT_TIME_OUT_MS = 2000;
@@ -138,4 +145,66 @@ public class HttpUtils {
public static String getBody(HttpServletRequest request) throws IOException {
return IOUtils.toString(request.getInputStream(), StandardCharsets.UTF_8);
}
+
+ /**
+ * Get the file size of the HTTP resource by sending a HEAD request.
+ * This method uses HTTP HEAD request to get the Content-Length header
+ * without downloading the entire file content.
+ * @param uri the HTTP URI to get file size for
+ * @return the file size in bytes, or -1 if the size cannot be determined
+ * @throws IOException if there's an error connecting to the HTTP resource
+ * @throws IllegalArgumentException if the URI is null or invalid
+ */
+ public static long getHttpFileSize(String uri, Map<String, String> headers) throws IOException {
+ if (uri == null || uri.trim().isEmpty()) {
+ throw new IllegalArgumentException("HTTP URI is null or empty");
+ }
+
+ HttpURLConnection connection = null;
+ try {
+ URL url = new URL(uri);
+ connection = (HttpURLConnection) url.openConnection();
+
+ // Use HEAD request to get headers without downloading content
+ connection.setRequestMethod("HEAD");
+ connection.setConnectTimeout(10000); // 10 seconds connection timeout
+ connection.setReadTimeout(30000); // 30 seconds read timeout
+
+ // Set common headers
+ connection.setRequestProperty("User-Agent", "Doris-HttpUtils/1.0");
+ connection.setRequestProperty("Accept", "*/*");
+ for (Map.Entry<String, String> entry : headers.entrySet()) {
+ connection.setRequestProperty(entry.getKey(), entry.getValue());
+ }
+
+ // Connect and get response
+ connection.connect();
+ int responseCode = connection.getResponseCode();
+
+ if (responseCode == HttpURLConnection.HTTP_OK) {
+ // Try to get Content-Length header
+ String contentLengthStr = connection.getHeaderField("Content-Length");
+ if (contentLengthStr != null && !contentLengthStr.trim().isEmpty()) {
+ try {
+ return Long.parseLong(contentLengthStr.trim());
+ } catch (NumberFormatException e) {
+ throw new IOException("Invalid Content-Length header:
" + contentLengthStr, e);
+ }
+ } else {
+ // Content-Length header not available
+ return -1;
+ }
+ } else {
+ throw new IOException("HTTP request failed with response code:
" + responseCode
+ + ", message: " + connection.getResponseMessage());
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to get file size for URI: {}", uri, e);
+ throw new IOException("Failed to get file size for URI: " + uri +
". " + Util.getRootCauseMessage(e), e);
+ } finally {
+ if (connection != null) {
+ connection.disconnect();
+ }
+ }
+ }
}
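A short usage sketch for the new helper, with placeholder values: it issues a HEAD request and returns the Content-Length, or -1 when the header is absent.

    // Usage sketch for HttpUtils.getHttpFileSize (illustrative values only).
    Map<String, String> headers = new HashMap<>();
    headers.put("Authorization", "Bearer <token>"); // optional; placeholder
    long size = HttpUtils.getHttpFileSize("https://example.com/data.parquet", headers);
    if (size < 0) {
        // Content-Length was missing; the caller must treat the size as unknown.
    }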
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Http.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Http.java
new file mode 100644
index 00000000000..3c5aeb4b687
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Http.java
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.table;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.exceptions.AnalysisException;
+import org.apache.doris.nereids.trees.expressions.Properties;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.coercion.AnyDataType;
+import org.apache.doris.tablefunction.HttpTableValuedFunction;
+import org.apache.doris.tablefunction.TableValuedFunctionIf;
+
+import java.util.Map;
+
+/**
+ * http tvf
+ */
+public class Http extends TableValuedFunction {
+ public Http(Properties properties) {
+ super("http", properties);
+ }
+
+ @Override
+ protected TableValuedFunctionIf toCatalogFunction() {
+ try {
+ Map<String, String> arguments = getTVFProperties().getMap();
+ return new HttpTableValuedFunction(arguments);
+ } catch (Throwable t) {
+ throw new AnalysisException("Can not build http(): " +
t.getMessage(), t);
+ }
+ }
+
+ @Override
+ public FunctionSignature customSignature() {
+ return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes());
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitHttp(this, context);
+ }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java
index a1cd65a8e8f..e831be0a6cf 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java
@@ -24,6 +24,7 @@ import org.apache.doris.nereids.trees.expressions.functions.table.Frontends;
import org.apache.doris.nereids.trees.expressions.functions.table.FrontendsDisks;
import org.apache.doris.nereids.trees.expressions.functions.table.GroupCommit;
import org.apache.doris.nereids.trees.expressions.functions.table.Hdfs;
+import org.apache.doris.nereids.trees.expressions.functions.table.Http;
import org.apache.doris.nereids.trees.expressions.functions.table.HttpStream;
import org.apache.doris.nereids.trees.expressions.functions.table.HudiMeta;
import org.apache.doris.nereids.trees.expressions.functions.table.IcebergMeta;
@@ -75,6 +76,10 @@ public interface TableValuedFunctionVisitor<R, C> {
return visitTableValuedFunction(tasks, context);
}
+ default R visitHttp(Http http, C context) {
+ return visitTableValuedFunction(http, context);
+ }
+
default R visitFrontendsDisks(FrontendsDisks frontendsDisks, C context) {
return visitTableValuedFunction(frontendsDisks, context);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/FileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/FileTableValuedFunction.java
index 3622056aa45..2cb5bedb446 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/FileTableValuedFunction.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/FileTableValuedFunction.java
@@ -25,6 +25,7 @@ import org.apache.doris.common.UserException;
import org.apache.doris.datasource.property.storage.AbstractS3CompatibleProperties;
import org.apache.doris.datasource.property.storage.AzureProperties;
import org.apache.doris.datasource.property.storage.HdfsCompatibleProperties;
+import org.apache.doris.datasource.property.storage.HttpProperties;
import org.apache.doris.datasource.property.storage.LocalProperties;
import org.apache.doris.datasource.property.storage.StorageProperties;
import org.apache.doris.planner.PlanNodeId;
@@ -58,6 +59,8 @@ public class FileTableValuedFunction extends ExternalFileTableValuedFunction {
delegateTvf = new HdfsTableValuedFunction(properties);
} else if (this.storageProperties instanceof LocalProperties) {
delegateTvf = new LocalTableValuedFunction(properties);
+ } else if (this.storageProperties instanceof HttpProperties) {
+ delegateTvf = new HttpTableValuedFunction(properties);
} else {
throw new AnalysisException("Could not find storage_type: " +
storageProperties);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java
new file mode 100644
index 00000000000..f97879632d4
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java
@@ -0,0 +1,755 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.tablefunction;
+
+import org.apache.doris.common.AnalysisException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Strings;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.util.EntityUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.IOException;
+import java.nio.file.FileSystems;
+import java.nio.file.PathMatcher;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Utility class for handling HuggingFace URLs and converting them to HTTP URLs.
+ *
+ * This class provides functionality to parse hf:// URLs and convert them to
+ * actual HTTP URLs that can be used to access files on HuggingFace Hub.
+ *
+ * Supported URL formats:
+ * - hf://datasets/username/dataset-name/path/to/file.parquet
+ * - hf://datasets/username/dataset-name@revision/path/to/file.parquet
+ * - hf://spaces/username/space-name/path/to/file.txt
+ *
+ * Example usage:
+ * String hfUrl = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+ * String httpUrl = HFUtils.convertHfUrlToHttpUrl(hfUrl);
+ * // Returns: https://huggingface.co/datasets/lhoestq/demo1/resolve/main/default/train/0000.parquet
+ */
+public class HFUtils {
+ private static final Logger LOG = LogManager.getLogger(HFUtils.class);
+
+ // Constants
+ private static final String HF_SCHEME = "hf://";
+ private static final String DEFAULT_ENDPOINT = "https://huggingface.co";
+ private static final String DEFAULT_REVISION = "main";
+ private static final String REPO_TYPE_DATASETS = "datasets";
+ private static final String REPO_TYPE_SPACES = "spaces";
+
+ // HTTP Client Configuration
+ private static final int DEFAULT_TIMEOUT_MS = 30000; // 30 seconds
+ private static final int DEFAULT_CONNECT_TIMEOUT_MS = 10000; // 10 seconds
+ private static final int DEFAULT_PAGE_LIMIT = 1000;
+
+ /**
+ * Parsed HuggingFace URL components
+ */
+ public static class ParsedHFUrl {
+ private String endpoint = DEFAULT_ENDPOINT;
+ private String repoType;
+ private String repository;
+ private String revision = DEFAULT_REVISION;
+ private String path;
+
+ public String getEndpoint() {
+ return endpoint;
+ }
+
+ public void setEndpoint(String endpoint) {
+ this.endpoint = endpoint;
+ }
+
+ public String getRepoType() {
+ return repoType;
+ }
+
+ public void setRepoType(String repoType) {
+ this.repoType = repoType;
+ }
+
+ public String getRepository() {
+ return repository;
+ }
+
+ public void setRepository(String repository) {
+ this.repository = repository;
+ }
+
+ public String getRevision() {
+ return revision;
+ }
+
+ public void setRevision(String revision) {
+ this.revision = revision;
+ }
+
+ public String getPath() {
+ return path;
+ }
+
+ public void setPath(String path) {
+ this.path = path;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("ParsedHFUrl{endpoint='%s', repoType='%s',
repository='%s', revision='%s', path='%s'}",
+ endpoint, repoType, repository, revision, path);
+ }
+ }
+
+ /**
+ * Convert a HuggingFace URL to an HTTP URL
+ *
+ * @param hfUrl The hf:// URL to convert
+ * @return The corresponding HTTP URL
+ * @throws AnalysisException if the URL format is invalid
+ */
+ @VisibleForTesting
+ public static String convertHfUrlToHttpUrl(String hfUrl) throws AnalysisException {
+ if (Strings.isNullOrEmpty(hfUrl)) {
+ throw new AnalysisException("HuggingFace URL cannot be null or
empty");
+ }
+
+ ParsedHFUrl parsedUrl = parseHfUrl(hfUrl);
+ return buildHttpUrl(parsedUrl);
+ }
+
+ /**
+ * Parse a HuggingFace URL into its components
+ *
+ * @param url The hf:// URL to parse
+ * @return ParsedHFUrl object containing the parsed components
+ * @throws AnalysisException if the URL format is invalid
+ */
+ @VisibleForTesting
+ public static ParsedHFUrl parseHfUrl(String url) throws AnalysisException {
+ if (Strings.isNullOrEmpty(url)) {
+ throw new AnalysisException("URL cannot be null or empty");
+ }
+
+ if (!url.startsWith(HF_SCHEME)) {
+ throw new AnalysisException("URL must start with 'hf://', got: " +
url);
+ }
+
+ ParsedHFUrl result = new ParsedHFUrl();
+
+ // Remove the hf:// prefix
+ String remaining = url.substring(HF_SCHEME.length());
+
+ if (remaining.isEmpty()) {
+ throwParseError(url);
+ }
+
+ String[] parts = remaining.split("/", -1); // -1 to keep empty strings
+
+ if (parts.length < 4) {
+ throwParseError(url);
+ }
+
+ // Parse repository type
+ result.setRepoType(parts[0]);
+ if (!REPO_TYPE_DATASETS.equals(result.getRepoType()) && !REPO_TYPE_SPACES.equals(result.getRepoType())) {
+ throw new AnalysisException(
+ String.format("Currently only supports 'datasets' and 'spaces'
repository types, got: '%s' in URL: %s",
+ result.getRepoType(), url));
+ }
+
+ // Parse username and repository name
+ String username = parts[1];
+ String repoName = parts[2];
+
+ if (username.isEmpty() || repoName.isEmpty()) {
+ throwParseError(url);
+ }
+
+ // Check if repository name contains revision
+ int atIndex = repoName.indexOf('@');
+ if (atIndex != -1) {
+ String actualRepoName = repoName.substring(0, atIndex);
+ result.setRevision(repoName.substring(atIndex + 1));
+
+ if (actualRepoName.isEmpty() || result.getRevision().isEmpty()) {
+ throwParseError(url);
+ }
+ result.setRepository(username + "/" + actualRepoName);
+ } else {
+ result.setRepository(username + "/" + repoName);
+ }
+
+ // Build the path from remaining parts
+ StringBuilder pathBuilder = new StringBuilder();
+ for (int i = 3; i < parts.length; i++) {
+ pathBuilder.append("/").append(parts[i]);
+ }
+ String rawPath = pathBuilder.toString();
+
+ // Handle HuggingFace web interface paths like /blob/main/ or /tree/main/
+ // These should be converted to proper API paths
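+ // e.g. hf://datasets/u/r/blob/main/a.csv -> repository "u/r", revision "main", path "/a.csv"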
+ if (rawPath.startsWith("/blob/") || rawPath.startsWith("/tree/")) {
+ // Extract revision and actual path
+ String[] pathParts = rawPath.substring(1).split("/", 3); // Remove leading slash and split
+ if (pathParts.length >= 2) {
+ // pathParts[0] is "blob" or "tree" - we don't need to use it
+ String pathRevision = pathParts[1]; // revision like "main"
+ String actualPath = pathParts.length > 2 ? "/" + pathParts[2] : "";
+
+ // Use the revision from the path if not already specified via @
+ if (result.getRevision().equals(DEFAULT_REVISION)) {
+ result.setRevision(pathRevision);
+ }
+
+ result.setPath(actualPath);
+ } else {
+ result.setPath(rawPath);
+ }
+ } else {
+ result.setPath(rawPath);
+ }
+
+ // If no path parts exist, set to empty string
+ if (result.getPath().isEmpty()) {
+ result.setPath("");
+ }
+ // Note: if path is "/" (from trailing slash), keep it as is
+
+ LOG.debug("Parsed HF URL: {} -> {}", url, result);
+ return result;
+ }
+
+ /**
+ * Build HTTP URL from parsed HF URL components
+ *
+ * @param parsedUrl The parsed HF URL components
+ * @return The HTTP URL string
+ */
+ private static String buildHttpUrl(ParsedHFUrl parsedUrl) {
+ // URL format: {endpoint}/{repo_type}/{repository}/resolve/{revision}{path}
+ StringBuilder httpUrl = new StringBuilder();
+
+ httpUrl.append(parsedUrl.getEndpoint());
+ if (!parsedUrl.getEndpoint().endsWith("/")) {
+ httpUrl.append("/");
+ }
+
+ httpUrl.append(parsedUrl.getRepoType()).append("/");
+ httpUrl.append(parsedUrl.getRepository()).append("/");
+ httpUrl.append("resolve").append("/");
+ httpUrl.append(parsedUrl.getRevision());
+ httpUrl.append(parsedUrl.getPath());
+
+ String result = httpUrl.toString();
+ LOG.debug("Built HTTP URL: {}", result);
+ return result;
+ }
+
+ /**
+ * Validate if a URL is a valid HuggingFace URL
+ *
+ * @param url The URL to validate
+ * @return true if it's a valid hf:// URL, false otherwise
+ */
+ @VisibleForTesting
+ public static boolean isValidHfUrl(String url) {
+ if (Strings.isNullOrEmpty(url)) {
+ return false;
+ }
+
+ try {
+ parseHfUrl(url);
+ return true;
+ } catch (AnalysisException e) {
+ LOG.debug("Invalid HF URL: {}, error: {}", url, e.getMessage());
+ return false;
+ }
+ }
+
+ /**
+ * Get the tree API URL for listing files in a repository
+ * This is useful for implementing glob patterns or directory listing
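+ * e.g. https://huggingface.co/api/datasets/lhoestq/demo1/tree/main/default/train?limit=100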
+ *
+ * @param parsedUrl The parsed HF URL components
+ * @param limit Optional limit for the number of results (0 means no limit)
+ * @return The tree API URL
+ */
+ @VisibleForTesting
+ public static String buildTreeApiUrl(ParsedHFUrl parsedUrl, int limit) {
+ return buildTreeApiUrl(parsedUrl, parsedUrl.getPath(), limit);
+ }
+
+ /**
+ * Get the tree API URL for listing files in a specific path
+ *
+ * @param parsedUrl The parsed HF URL components
+ * @param path The specific path to list
+ * @param limit Optional limit for the number of results (0 means no limit)
+ * @return The tree API URL
+ */
+ private static String buildTreeApiUrl(ParsedHFUrl parsedUrl, String path, int limit) {
+ // URL format: {endpoint}/api/{repo_type}/{repository}/tree/{revision}{path}
+ StringBuilder treeUrl = new StringBuilder();
+
+ treeUrl.append(parsedUrl.getEndpoint());
+ if (!parsedUrl.getEndpoint().endsWith("/")) {
+ treeUrl.append("/");
+ }
+
+ treeUrl.append("api").append("/");
+ treeUrl.append(parsedUrl.getRepoType()).append("/");
+ treeUrl.append(parsedUrl.getRepository()).append("/");
+ treeUrl.append("tree").append("/");
+ treeUrl.append(parsedUrl.getRevision());
+
+ // Add path if provided
+ if (!Strings.isNullOrEmpty(path) && !"/".equals(path)) {
+ if (!path.startsWith("/")) {
+ treeUrl.append("/");
+ }
+ treeUrl.append(path);
+ }
+
+ if (limit > 0) {
+ treeUrl.append("?limit=").append(limit);
+ }
+
+ String result = treeUrl.toString();
+ LOG.debug("Built tree API URL: {}", result);
+ return result;
+ }
+
+ /**
+ * Extract repository information from HF URL for display purposes
+ *
+ * @param hfUrl The hf:// URL
+ * @return A human-readable repository description
+ * @throws AnalysisException if the URL is invalid
+ */
+ @VisibleForTesting
+ public static String getRepositoryInfo(String hfUrl) throws AnalysisException {
+ ParsedHFUrl parsed = parseHfUrl(hfUrl);
+ return String.format("%s/%s@%s", parsed.getRepoType(),
parsed.getRepository(), parsed.getRevision());
+ }
+
+ /**
+ * Expand a HuggingFace URL with glob patterns to matching file URLs
+ *
+ * @param hfGlobUrl The hf:// URL with glob patterns
+ * @return List of HTTP URLs that match the glob pattern
+ * @throws AnalysisException if the URL format is invalid or glob processing fails
+ */
+ public static List<String> expandGlob(String hfGlobUrl) throws AnalysisException {
+ return expandGlob(hfGlobUrl, null);
+ }
+
+ /**
+ * Expand a HuggingFace URL with glob patterns to matching file URLs
+ *
+ * @param hfGlobUrl The hf:// URL with glob patterns
+ * @param authToken Optional authentication token for private repositories
+ * @return List of HTTP URLs that match the glob pattern
+ * @throws AnalysisException if the URL format is invalid or glob processing fails
+ */
+ public static List<String> expandGlob(String hfGlobUrl, String authToken) throws AnalysisException {
+ if (Strings.isNullOrEmpty(hfGlobUrl)) {
+ throw new AnalysisException("HuggingFace glob URL cannot be null
or empty");
+ }
+
+ // Parse the glob URL
+ ParsedHFUrl parsedUrl = parseHfUrl(hfGlobUrl);
+
+ // Check if the path contains wildcard characters
+ String path = parsedUrl.getPath();
+ if (!containsWildcards(path)) {
+ // No wildcards, return the single file
+ List<String> result = new ArrayList<>();
+ result.add(buildHttpUrl(parsedUrl));
+ return result;
+ }
+
+ // Find the longest prefix without wildcards
+ String sharedPath = getLongestPrefixWithoutWildcards(path);
+
+ // Prepare headers
+ Map<String, String> headers = new HashMap<>();
+ if (!Strings.isNullOrEmpty(authToken)) {
+ headers.put("Authorization", "Bearer " + authToken);
+ }
+
+ List<String> result = new ArrayList<>();
+
+ try {
+ // Get all files and directories to process
+ List<String> pathsToProcess = new ArrayList<>();
+ pathsToProcess.add(sharedPath);
+
+ List<String> allFilePaths = new ArrayList<>();
+
+ // Calculate the depth needed for recursion
+ // Count the number of path components in the pattern after the shared prefix
+ String remainingPattern = path.substring(sharedPath.length());
+ int patternDepth = splitPath(remainingPattern).size();
+
+ // If pattern contains **, we need unlimited recursion
+ boolean unlimitedRecursion = path.contains("**");
+
+ // For a pattern like /*/*.parquet (depth=2), we need to recurse into depth 1
+ // to list files at depth 2. So maxRecursionDepth = patternDepth - 1
+ // But if patternDepth is 1 or less, we still need depth 0, so use Math.max
+ int maxRecursionDepth = unlimitedRecursion ? Integer.MAX_VALUE : Math.max(0, patternDepth - 1);
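+ // Illustrative example: for path "/data/*/*.parquet", sharedPath is "/data" and
+ // remainingPattern is "/*/*.parquet" (patternDepth 2), so maxRecursionDepth is 1:
+ // the listing recurses one level below /data to reach files two levels deep.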
+
+ // Track depth for each path being processed
+ Map<String, Integer> pathDepths = new HashMap<>();
+ pathDepths.put(sharedPath, 0);
+
+ // Process directories recursively if needed
+ while (!pathsToProcess.isEmpty()) {
+ String currentPath = pathsToProcess.remove(0);
+ int currentDepth = pathDepths.getOrDefault(currentPath, 0);
+
+ // List files in current directory
+ List<String> files = new ArrayList<>();
+ List<String> directories = new ArrayList<>();
+ listHuggingFaceFiles(parsedUrl, currentPath, headers, files, directories);
+
+ // Add all file paths
+ allFilePaths.addAll(files);
+
+ // Add directories for recursive processing based on pattern depth
+ // We need to recurse if current depth is less than max recursion depth
+ if (currentDepth < maxRecursionDepth) {
+ for (String dir : directories) {
+ pathsToProcess.add(dir);
+ pathDepths.put(dir, currentDepth + 1);
+ }
+ }
+ }
+
+ // Filter files using glob pattern matching
+ List<String> patternComponents = splitPath(path);
+
+ for (String filePath : allFilePaths) {
+ List<String> fileComponents = splitPath(filePath);
+
+ if (matchPathComponents(fileComponents, patternComponents)) {
+ // Build the complete HTTP URL for the matched file
+ ParsedHFUrl fileUrl = new ParsedHFUrl();
+ fileUrl.setEndpoint(parsedUrl.getEndpoint());
+ fileUrl.setRepoType(parsedUrl.getRepoType());
+ fileUrl.setRepository(parsedUrl.getRepository());
+ fileUrl.setRevision(parsedUrl.getRevision());
+ fileUrl.setPath(filePath);
+
+ String httpUrl = buildHttpUrl(fileUrl);
+ result.add(httpUrl);
+ }
+ }
+
+ } catch (Exception e) {
+ throw new AnalysisException("Failed to expand glob pattern: " +
e.getMessage());
+ }
+
+ return result;
+ }
+
+ /**
+ * Create HTTP client with proper configuration
+ */
+ private static CloseableHttpClient createHttpClient() {
+ RequestConfig config = RequestConfig.custom()
+ .setConnectTimeout(DEFAULT_CONNECT_TIMEOUT_MS)
+ .setConnectionRequestTimeout(DEFAULT_TIMEOUT_MS)
+ .setSocketTimeout(DEFAULT_TIMEOUT_MS)
+ .build();
+
+ return HttpClientBuilder.create()
+ .setDefaultRequestConfig(config)
+ .build();
+ }
+
+ /**
+ * Execute HTTP GET request
+ */
+ private static String executeHttpGet(String url, Map<String, String> headers) throws IOException {
+ try (CloseableHttpClient client = createHttpClient()) {
+ HttpGet httpGet = new HttpGet(url);
+
+ // Set headers
+ if (headers != null) {
+ for (Map.Entry<String, String> entry : headers.entrySet()) {
+ httpGet.setHeader(entry.getKey(), entry.getValue());
+ }
+ }
+
+ // Set User-Agent
+ httpGet.setHeader("User-Agent", "Doris-HFUtils/1.0");
+
+ return client.execute(httpGet, response -> {
+ int statusCode = response.getStatusLine().getStatusCode();
+ String responseBody = EntityUtils.toString(response.getEntity());
+
+ if (statusCode >= 400) {
+ throw new IOException("HTTP " + statusCode + ": " +
responseBody);
+ }
+
+ return responseBody;
+ });
+ }
+ }
+
+ /**
+ * List files from HuggingFace API with pagination support
+ */
+ private static void listHuggingFaceFiles(ParsedHFUrl parsedUrl, String path,
+ Map<String, String> headers,
+ List<String> files, List<String> directories) throws AnalysisException {
+ // Build API URL
+ String apiUrl = buildTreeApiUrl(parsedUrl, path, DEFAULT_PAGE_LIMIT);
+
+ String nextUrl = apiUrl;
+ int pageCount = 0;
+
+ while (nextUrl != null && pageCount < 100) { // Prevent infinite loops
+ try {
+ String response = executeHttpGet(nextUrl, headers);
+
+ // Parse JSON response
+ JsonArray jsonArray = JsonParser.parseString(response).getAsJsonArray();
+
+ for (JsonElement element : jsonArray) {
+ JsonObject obj = element.getAsJsonObject();
+
+ String filePath = "/" + obj.get("path").getAsString();
+ String type = obj.get("type").getAsString();
+
+ if ("file".equals(type)) {
+ files.add(filePath);
+ } else if ("directory".equals(type)) {
+ directories.add(filePath);
+ }
+ }
+
+ // For simplicity, we don't handle pagination in this basic version
+ // In a real implementation, you would parse Link headers here
+ nextUrl = null;
+ pageCount++;
+
+ } catch (Exception e) {
+ throw new AnalysisException("Failed to list files from
HuggingFace API: " + e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Check if a path contains wildcard characters
+ *
+ * @param path The path to check
+ * @return true if the path contains wildcards, false otherwise
+ */
+ @VisibleForTesting
+ public static boolean containsWildcards(String path) {
+ if (Strings.isNullOrEmpty(path)) {
+ return false;
+ }
+ return path.contains("*") || path.contains("?") || path.contains("[")
|| path.contains("{");
+ }
+
+ /**
+ * Get the longest prefix of a path that doesn't contain wildcards
+ *
+ * @param path The path to analyze
+ * @return The longest prefix without wildcards
+ */
+ @VisibleForTesting
+ public static String getLongestPrefixWithoutWildcards(String path) {
+ if (Strings.isNullOrEmpty(path)) {
+ return "";
+ }
+
+ int firstWildcardPos = -1;
+ for (int i = 0; i < path.length(); i++) {
+ char c = path.charAt(i);
+ if (c == '*' || c == '?' || c == '[' || c == '{') {
+ firstWildcardPos = i;
+ break;
+ }
+ }
+
+ if (firstWildcardPos == -1) {
+ return path; // No wildcards found
+ }
+
+ // Find the last slash before the first wildcard
+ String prefix = path.substring(0, firstWildcardPos);
+ int lastSlash = prefix.lastIndexOf('/');
+
+ if (lastSlash == -1) {
+ return ""; // Root path
+ }
+
+ return path.substring(0, lastSlash);
+ }
+
+ /**
+ * Match a file path against a glob pattern
+ * This is a simplified implementation based on DuckDB's Match function
+ *
+ * @param filePath The file path to match
+ * @param globPattern The glob pattern
+ * @return true if the file matches the pattern, false otherwise
+ */
+ @VisibleForTesting
+ public static boolean matchGlobPattern(String filePath, String globPattern) {
+ if (Strings.isNullOrEmpty(filePath) || Strings.isNullOrEmpty(globPattern)) {
+ return false;
+ }
+
+ try {
+ // Use Java's built-in glob pattern matching
+ PathMatcher matcher = FileSystems.getDefault().getPathMatcher("glob:" + globPattern);
+ return matcher.matches(Paths.get(filePath));
+ } catch (Exception e) {
+ LOG.warn("Failed to match glob pattern: {} against file: {},
error: {}",
+ globPattern, filePath, e.getMessage());
+ return false;
+ }
+ }
+
+ /**
+ * Split a path into components for pattern matching
+ *
+ * @param path The path to split
+ * @return List of path components
+ */
+ @VisibleForTesting
+ public static List<String> splitPath(String path) {
+ if (Strings.isNullOrEmpty(path)) {
+ return new ArrayList<>();
+ }
+
+ List<String> components = new ArrayList<>();
+ String[] parts = path.split("/");
+ for (String part : parts) {
+ if (!part.isEmpty()) {
+ components.add(part);
+ }
+ }
+ return components;
+ }
+
+ /**
+ * Advanced pattern matching similar to DuckDB's Match function
+ * Supports ** for recursive matching
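+ * Example (illustrative): path components ["data", "train", "0000.parquet"] match the
+ * pattern ["data", "**", "*.parquet"], since "**" may consume zero or more components
+ * before "*.parquet" matches the final component.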
+ *
+ * @param pathComponents The path components to match
+ * @param patternComponents The pattern components
+ * @return true if the path matches the pattern, false otherwise
+ */
+ @VisibleForTesting
+ public static boolean matchPathComponents(List<String> pathComponents, List<String> patternComponents) {
+ return matchPathComponentsRecursive(pathComponents, 0, patternComponents, 0);
+ }
+
+ private static boolean matchPathComponentsRecursive(List<String> pathComponents, int pathIndex,
+ List<String> patternComponents, int patternIndex) {
+ // Base cases
+ if (pathIndex >= pathComponents.size() && patternIndex >= patternComponents.size()) {
+ return true; // Both exhausted, match
+ }
+ if (patternIndex >= patternComponents.size()) {
+ return false; // Pattern exhausted but path remains
+ }
+ if (pathIndex >= pathComponents.size()) {
+ // Path exhausted, check if remaining pattern is all **
+ for (int i = patternIndex; i < patternComponents.size(); i++) {
+ if (!"**".equals(patternComponents.get(i))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ String currentPattern = patternComponents.get(patternIndex);
+
+ if ("**".equals(currentPattern)) {
+ // ** matches zero or more path components
+ if (patternIndex + 1 >= patternComponents.size()) {
+ return true; // ** at end matches everything
+ }
+
+ // Try matching ** with 0, 1, 2, ... path components
+ for (int i = pathIndex; i <= pathComponents.size(); i++) {
+ if (matchPathComponentsRecursive(pathComponents, i, patternComponents, patternIndex + 1)) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ // Regular pattern matching (including * and [])
+ String currentPath = pathComponents.get(pathIndex);
+ if (matchGlobPattern(currentPath, currentPattern)) {
+ return matchPathComponentsRecursive(pathComponents, pathIndex + 1, patternComponents, patternIndex + 1);
+ }
+ return false;
+ }
+ }
+
+ /**
+ * Validate if a URL contains valid glob patterns
+ *
+ * @param hfUrl The hf:// URL to validate
+ * @return true if it's a valid glob URL, false otherwise
+ */
+ @VisibleForTesting
+ public static boolean isValidGlobUrl(String hfUrl) {
+ if (!isValidHfUrl(hfUrl)) {
+ return false;
+ }
+
+ try {
+ ParsedHFUrl parsed = parseHfUrl(hfUrl);
+ return containsWildcards(parsed.getPath());
+ } catch (AnalysisException e) {
+ return false;
+ }
+ }
+
+ private static void throwParseError(String url) throws AnalysisException {
+ throw new AnalysisException(
+ String.format("Failed to parse HuggingFace URL: '%s'. "
+ + "Please format URL like:
'hf://datasets/username/dataset-name/path/to/file.parquet' "
+ + "or
'hf://datasets/username/dataset-name@revision/path/to/file.parquet'", url));
+ }
+}
+
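A minimal standalone sketch (not part of the patch) of how the HFUtils helpers
above fit together; the repository, revision and file names here are
hypothetical:

    // Resolve a single hf:// URI to the HTTP URL that is actually fetched.
    String hfUri = "hf://datasets/user/repo@v1.0/data/train.parquet";
    String httpUrl = HFUtils.convertHfUrlToHttpUrl(hfUri);
    // -> "https://huggingface.co/datasets/user/repo/resolve/v1.0/data/train.parquet"

    // Expand a glob URI into concrete file URLs via the HuggingFace tree API.
    List<String> urls = HFUtils.expandGlob("hf://datasets/user/repo/data/*.parquet");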
diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
new file mode 100644
index 00000000000..d8311c822a6
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.tablefunction;
+
+import org.apache.doris.analysis.BrokerDesc;
+import org.apache.doris.analysis.StorageBackend.StorageType;
+import org.apache.doris.common.AnalysisException;
+import org.apache.doris.datasource.property.storage.HttpProperties;
+import org.apache.doris.datasource.property.storage.StorageProperties;
+import org.apache.doris.httpv2.rest.manager.HttpUtils;
+import org.apache.doris.thrift.TBrokerFileStatus;
+import org.apache.doris.thrift.TFileType;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * The implementation of the table valued function
+ * http("uri" = "https://example.com/data.csv", "FORMAT" = "csv").
+ */
+public class HttpTableValuedFunction extends ExternalFileTableValuedFunction {
+ public static final String NAME = "http";
+
+ private HttpProperties httpProperties;
+ private String uri;
+
+ public HttpTableValuedFunction(Map<String, String> properties) throws AnalysisException {
+ Map<String, String> props = super.parseCommonProperties(properties);
+ props.put(StorageProperties.FS_HTTP_SUPPORT, "true");
+ try {
+ this.storageProperties = StorageProperties.createPrimary(props);
+ if (!(storageProperties instanceof HttpProperties)) {
+ throw new AnalysisException("HttpTableValuedFunction only
support http storage properties");
+ }
+
+ this.httpProperties = (HttpProperties) storageProperties;
+ this.uri = this.httpProperties.validateAndGetUri(props);
+
+ this.backendConnectProperties.putAll(storageProperties.getBackendConfigProperties());
+ generateFileStatus();
+ } catch (Exception e) {
+ throw new AnalysisException("Failed check http storage props, " +
e.getMessage(), e);
+ }
+ }
+
+ private void generateFileStatus() throws Exception {
+ this.fileStatuses.clear();
+ if (this.uri.startsWith("http://") || this.uri.startsWith("https://"))
{
+ this.fileStatuses.add(new TBrokerFileStatus(this.uri, false,
+ HttpUtils.getHttpFileSize(this.uri, this.httpProperties.getHeaders()), true));
+ } else if (this.uri.startsWith("hf://")) {
+ List<String> fileUrls = HFUtils.expandGlob(this.uri);
+ if (LOG.isDebugEnabled()) {
+ for (String fileUrl : fileUrls) {
+ LOG.debug("HttpTableValuedFunction expand hf glob uri:
{}", fileUrl);
+ }
+ }
+ for (String fileUrl : fileUrls) {
+ this.fileStatuses.add(new TBrokerFileStatus(fileUrl, false,
+ HttpUtils.getHttpFileSize(fileUrl, this.httpProperties.getHeaders()), true));
+ }
+ } else {
+ throw new AnalysisException("HttpTableValuedFunction uri is
invalid: " + this.uri);
+ }
+ }
+
+ @Override
+ public TFileType getTFileType() {
+ return TFileType.FILE_HTTP;
+ }
+
+ @Override
+ public String getFilePath() {
+ if (uri == null) {
+ throw new IllegalArgumentException("HttpTableValuedFunction uri is
null");
+ }
+ return uri;
+ }
+
+ @Override
+ public BrokerDesc getBrokerDesc() {
+ return new BrokerDesc("HttpTvfBroker", StorageType.HTTP,
processedParams);
+ }
+
+ @Override
+ public String getTableName() {
+ return "HttpTableValuedFunction";
+ }
+}
+
+
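On the SQL side this surfaces as, for example, select * from http("uri" =
"hf://datasets/user/repo/data.csv", "format" = "csv"). The "uri" and "FORMAT"
property names follow the class Javadoc above, while the repository itself is
hypothetical. Plain http:// and https:// URIs take the single-file branch of
generateFileStatus(), whereas hf:// URIs are first expanded through
HFUtils.expandGlob().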
diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java
index 65b187f3830..b7bdbcaa35b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java
@@ -86,6 +86,8 @@ public abstract class TableValuedFunctionIf {
return new PartitionValuesTableValuedFunction(params);
case FileTableValuedFunction.NAME:
return new FileTableValuedFunction(params);
+ case HttpTableValuedFunction.NAME:
+ return new HttpTableValuedFunction(params);
default:
throw new AnalysisException("Could not find table function " +
funcName);
}
diff --git a/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java b/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java
new file mode 100644
index 00000000000..8d18b021af0
--- /dev/null
+++ b/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java
@@ -0,0 +1,416 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.tablefunction;
+
+import org.apache.doris.common.AnalysisException;
+import org.apache.doris.tablefunction.HFUtils.ParsedHFUrl;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+public class HFUtilsTest {
+
+ @Test
+ public void testValidHfUrlParsing() throws AnalysisException {
+ // Test basic dataset URL
+ String url1 = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+ ParsedHFUrl parsed1 = HFUtils.parseHfUrl(url1);
+
+ Assert.assertEquals("datasets", parsed1.getRepoType());
+ Assert.assertEquals("lhoestq/demo1", parsed1.getRepository());
+ Assert.assertEquals("main", parsed1.getRevision());
+ Assert.assertEquals("/default/train/0000.parquet", parsed1.getPath());
+ Assert.assertEquals("https://huggingface.co", parsed1.getEndpoint());
+
+ // Test URL with revision
+ String url2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+ ParsedHFUrl parsed2 = HFUtils.parseHfUrl(url2);
+
+ Assert.assertEquals("datasets", parsed2.getRepoType());
+ Assert.assertEquals("username/dataset", parsed2.getRepository());
+ Assert.assertEquals("v1.0", parsed2.getRevision());
+ Assert.assertEquals("/path/to/file.csv", parsed2.getPath());
+
+ // Test spaces URL
+ String url3 = "hf://spaces/gradio/calculator/app.py";
+ ParsedHFUrl parsed3 = HFUtils.parseHfUrl(url3);
+
+ Assert.assertEquals("spaces", parsed3.getRepoType());
+ Assert.assertEquals("gradio/calculator", parsed3.getRepository());
+ Assert.assertEquals("main", parsed3.getRevision());
+ Assert.assertEquals("/app.py", parsed3.getPath());
+
+ // Test URL with empty path
+ String url4 = "hf://datasets/user/repo/";
+ ParsedHFUrl parsed4 = HFUtils.parseHfUrl(url4);
+
+ Assert.assertEquals("datasets", parsed4.getRepoType());
+ Assert.assertEquals("user/repo", parsed4.getRepository());
+ Assert.assertEquals("main", parsed4.getRevision());
+ Assert.assertEquals("/", parsed4.getPath());
+
+ // Test URL with HuggingFace web interface format (/blob/main/)
+ String url5 = "hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv";
+ ParsedHFUrl parsed5 = HFUtils.parseHfUrl(url5);
+
+ Assert.assertEquals("datasets", parsed5.getRepoType());
+ Assert.assertEquals("fka/awesome-chatgpt-prompts",
parsed5.getRepository());
+ Assert.assertEquals("main", parsed5.getRevision());
+ Assert.assertEquals("/prompts.csv", parsed5.getPath());
+
+ // Test URL with HuggingFace web interface format (/tree/v1.0/)
+ String url6 = "hf://datasets/user/dataset/tree/v1.0/data/file.txt";
+ ParsedHFUrl parsed6 = HFUtils.parseHfUrl(url6);
+
+ Assert.assertEquals("datasets", parsed6.getRepoType());
+ Assert.assertEquals("user/dataset", parsed6.getRepository());
+ Assert.assertEquals("v1.0", parsed6.getRevision());
+ Assert.assertEquals("/data/file.txt", parsed6.getPath());
+ }
+
+ @Test
+ public void testHttpUrlConversion() throws AnalysisException {
+ // Test basic conversion
+ String hfUrl1 = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+ String httpUrl1 = HFUtils.convertHfUrlToHttpUrl(hfUrl1);
+ String expected1 = "https://huggingface.co/datasets/lhoestq/demo1/resolve/main/default/train/0000.parquet";
+ Assert.assertEquals(expected1, httpUrl1);
+
+ // Test conversion with revision
+ String hfUrl2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+ String httpUrl2 = HFUtils.convertHfUrlToHttpUrl(hfUrl2);
+ String expected2 = "https://huggingface.co/datasets/username/dataset/resolve/v1.0/path/to/file.csv";
+ Assert.assertEquals(expected2, httpUrl2);
+
+ // Test spaces conversion
+ String hfUrl3 = "hf://spaces/gradio/calculator/app.py";
+ String httpUrl3 = HFUtils.convertHfUrlToHttpUrl(hfUrl3);
+ String expected3 = "https://huggingface.co/spaces/gradio/calculator/resolve/main/app.py";
+ Assert.assertEquals(expected3, httpUrl3);
+
+ // Test HuggingFace web interface format conversion
+ String hfUrl4 = "hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv";
+ String httpUrl4 = HFUtils.convertHfUrlToHttpUrl(hfUrl4);
+ String expected4 = "https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv";
+ Assert.assertEquals(expected4, httpUrl4);
+ }
+
+ @Test
+ public void testTreeApiUrlGeneration() throws AnalysisException {
+ String hfUrl = "hf://datasets/lhoestq/demo1/default/train";
+ ParsedHFUrl parsed = HFUtils.parseHfUrl(hfUrl);
+
+ // Test without limit
+ String treeUrl1 = HFUtils.buildTreeApiUrl(parsed, 0);
+ String expected1 = "https://huggingface.co/api/datasets/lhoestq/demo1/tree/main/default/train";
+ Assert.assertEquals(expected1, treeUrl1);
+
+ // Test with limit
+ String treeUrl2 = HFUtils.buildTreeApiUrl(parsed, 100);
+ String expected2 = "https://huggingface.co/api/datasets/lhoestq/demo1/tree/main/default/train?limit=100";
+ Assert.assertEquals(expected2, treeUrl2);
+ }
+
+ @Test
+ public void testRepositoryInfo() throws AnalysisException {
+ String hfUrl1 = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+ String repoInfo1 = HFUtils.getRepositoryInfo(hfUrl1);
+ Assert.assertEquals("datasets/lhoestq/demo1@main", repoInfo1);
+
+ String hfUrl2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+ String repoInfo2 = HFUtils.getRepositoryInfo(hfUrl2);
+ Assert.assertEquals("datasets/username/[email protected]", repoInfo2);
+ }
+
+ @Test
+ public void testValidHfUrlValidation() {
+ // Valid URLs
+ Assert.assertTrue(HFUtils.isValidHfUrl("hf://datasets/user/repo/file.txt"));
+ Assert.assertTrue(HFUtils.isValidHfUrl("hf://spaces/user/space/app.py"));
+ Assert.assertTrue(HFUtils.isValidHfUrl("hf://datasets/user/repo@v1.0/file.txt"));
+
+ // Invalid URLs
+ Assert.assertFalse(HFUtils.isValidHfUrl(null));
+ Assert.assertFalse(HFUtils.isValidHfUrl(""));
+ Assert.assertFalse(HFUtils.isValidHfUrl("http://example.com"));
+ Assert.assertFalse(HFUtils.isValidHfUrl("hf://"));
+ Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets"));
+ Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/"));
+ Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/user"));
+ Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/user/repo"));
// Missing path
+
Assert.assertFalse(HFUtils.isValidHfUrl("hf://invalid/user/repo/file.txt"));
+ }
+
+ @Test
+ public void testInvalidUrlExceptions() {
+ // Test null/empty URL
+ try {
+ HFUtils.parseHfUrl(null);
+ Assert.fail("Should throw AnalysisException for null URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ try {
+ HFUtils.parseHfUrl("");
+ Assert.fail("Should throw AnalysisException for empty URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ // Test non-hf URL
+ try {
+ HFUtils.parseHfUrl("http://example.com");
+ Assert.fail("Should throw AnalysisException for non-hf URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("must start with
'hf://'"));
+ }
+
+ // Test incomplete URL
+ try {
+ HFUtils.parseHfUrl("hf://datasets");
+ Assert.fail("Should throw AnalysisException for incomplete URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("Failed to parse
HuggingFace URL"));
+ }
+
+ // Test invalid repository type
+ try {
+ HFUtils.parseHfUrl("hf://models/user/model/file.txt");
+ Assert.fail("Should throw AnalysisException for unsupported repo
type");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("only supports
'datasets' and 'spaces'"));
+ }
+
+ // Test empty username
+ try {
+ HFUtils.parseHfUrl("hf://datasets//repo/file.txt");
+ Assert.fail("Should throw AnalysisException for empty username");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("Failed to parse
HuggingFace URL"));
+ }
+
+ // Test empty revision
+ try {
+ HFUtils.parseHfUrl("hf://datasets/user/repo@/file.txt");
+ Assert.fail("Should throw AnalysisException for empty revision");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("Failed to parse
HuggingFace URL"));
+ }
+ }
+
+ @Test
+ public void testConvertHfUrlToHttpUrlExceptions() {
+ // Test null URL
+ try {
+ HFUtils.convertHfUrlToHttpUrl(null);
+ Assert.fail("Should throw AnalysisException for null URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ // Test empty URL
+ try {
+ HFUtils.convertHfUrlToHttpUrl("");
+ Assert.fail("Should throw AnalysisException for empty URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ // Test invalid URL
+ try {
+ HFUtils.convertHfUrlToHttpUrl("http://example.com");
+ Assert.fail("Should throw AnalysisException for invalid URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("must start with
'hf://'"));
+ }
+ }
+
+ @Test
+ public void testEdgeCases() throws AnalysisException {
+ // Test URL with special characters in path
+ String hfUrl1 = "hf://datasets/user/repo/path with
spaces/file-name_123.parquet";
+ String httpUrl1 = HFUtils.convertHfUrlToHttpUrl(hfUrl1);
+ String expected1 = "https://huggingface.co/datasets/user/repo/resolve/main/path with spaces/file-name_123.parquet";
+ Assert.assertEquals(expected1, httpUrl1);
+
+ // Test URL with multiple slashes in path
+ String hfUrl2 = "hf://datasets/user/repo/path/to/deep/nested/file.txt";
+ String httpUrl2 = HFUtils.convertHfUrlToHttpUrl(hfUrl2);
+ String expected2 = "https://huggingface.co/datasets/user/repo/resolve/main/path/to/deep/nested/file.txt";
+ Assert.assertEquals(expected2, httpUrl2);
+
+ // Test URL with revision containing special characters
+ String hfUrl3 = "hf://datasets/user/[email protected]/file.txt";
+ String httpUrl3 = HFUtils.convertHfUrlToHttpUrl(hfUrl3);
+ String expected3 = "https://huggingface.co/datasets/user/repo/resolve/feature-branch-v1.0/file.txt";
+ Assert.assertEquals(expected3, httpUrl3);
+ }
+
+ @Test
+ public void testGlobFunctionality() throws AnalysisException {
+ // Test wildcard detection
+ Assert.assertTrue(HFUtils.containsWildcards("/path/*.parquet"));
+ Assert.assertTrue(HFUtils.containsWildcards("/path/**/train/*.csv"));
+ Assert.assertTrue(HFUtils.containsWildcards("/path/file_[abc].txt"));
+ Assert.assertTrue(HFUtils.containsWildcards("/path/file_{1,2,3}.txt"));
+ Assert.assertFalse(HFUtils.containsWildcards("/path/file.txt"));
+ Assert.assertFalse(HFUtils.containsWildcards(""));
+ Assert.assertFalse(HFUtils.containsWildcards(null));
+
+ // Test longest prefix extraction
+ Assert.assertEquals("/path",
HFUtils.getLongestPrefixWithoutWildcards("/path/*.parquet"));
+ Assert.assertEquals("/path",
HFUtils.getLongestPrefixWithoutWildcards("/path/**/train/*.csv"));
+ Assert.assertEquals("/path",
HFUtils.getLongestPrefixWithoutWildcards("/path/file_[abc].txt"));
+ Assert.assertEquals("/path/to/deep",
HFUtils.getLongestPrefixWithoutWildcards("/path/to/deep/*.txt"));
+ Assert.assertEquals("/path/file.txt",
HFUtils.getLongestPrefixWithoutWildcards("/path/file.txt"));
+ Assert.assertEquals("",
HFUtils.getLongestPrefixWithoutWildcards("*.txt"));
+
+ // Test glob URL validation
+ Assert.assertTrue(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/*.parquet"));
+ Assert.assertTrue(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/**/train/*.csv"));
+ Assert.assertFalse(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/file.txt"));
+ Assert.assertFalse(HFUtils.isValidGlobUrl("http://example.com/*.txt"));
+ Assert.assertFalse(HFUtils.isValidGlobUrl(null));
+ }
+
+ @Test
+ public void testGlobPatternMatching() {
+ // Test basic pattern matching
+ Assert.assertTrue(HFUtils.matchGlobPattern("file.txt", "*.txt"));
+ Assert.assertTrue(HFUtils.matchGlobPattern("file.parquet",
"*.parquet"));
+ Assert.assertTrue(HFUtils.matchGlobPattern("file_a.txt",
"file_[abc].txt"));
+ Assert.assertFalse(HFUtils.matchGlobPattern("file_d.txt",
"file_[abc].txt"));
+ Assert.assertFalse(HFUtils.matchGlobPattern("file.csv", "*.txt"));
+
+ // Test edge cases
+ Assert.assertFalse(HFUtils.matchGlobPattern(null, "*.txt"));
+ Assert.assertFalse(HFUtils.matchGlobPattern("file.txt", null));
+ Assert.assertFalse(HFUtils.matchGlobPattern("", "*.txt"));
+ }
+
+ @Test
+ public void testPathSplitting() {
+ List<String> components1 = HFUtils.splitPath("/path/to/file.txt");
+ Assert.assertEquals(3, components1.size());
+ Assert.assertEquals("path", components1.get(0));
+ Assert.assertEquals("to", components1.get(1));
+ Assert.assertEquals("file.txt", components1.get(2));
+
+ List<String> components2 = HFUtils.splitPath("path/to/file.txt");
+ Assert.assertEquals(3, components2.size());
+ Assert.assertEquals("path", components2.get(0));
+
+ List<String> components3 = HFUtils.splitPath("");
+ Assert.assertEquals(0, components3.size());
+
+ List<String> components4 = HFUtils.splitPath(null);
+ Assert.assertEquals(0, components4.size());
+ }
+
+ @Test
+ public void testAdvancedPatternMatching() {
+ // Test ** recursive matching
+ List<String> pathComponents1 = HFUtils.splitPath("path/to/deep/file.txt");
+ List<String> patternComponents1 = HFUtils.splitPath("path/**/file.txt");
+ Assert.assertTrue(HFUtils.matchPathComponents(pathComponents1, patternComponents1));
+
+ List<String> pathComponents2 = HFUtils.splitPath("path/file.txt");
+ List<String> patternComponents2 = HFUtils.splitPath("path/**/file.txt");
+ Assert.assertTrue(HFUtils.matchPathComponents(pathComponents2, patternComponents2));
+
+ List<String> pathComponents3 = HFUtils.splitPath("different/file.txt");
+ List<String> patternComponents3 = HFUtils.splitPath("path/**/file.txt");
+ Assert.assertFalse(HFUtils.matchPathComponents(pathComponents3, patternComponents3));
+
+ // Test single * matching
+ List<String> pathComponents4 = HFUtils.splitPath("path/train/file.txt");
+ List<String> patternComponents4 = HFUtils.splitPath("path/*/file.txt");
+ Assert.assertTrue(HFUtils.matchPathComponents(pathComponents4, patternComponents4));
+
+ List<String> pathComponents5 = HFUtils.splitPath("path/to/deep/file.txt");
+ List<String> patternComponents5 = HFUtils.splitPath("path/*/file.txt");
+ Assert.assertFalse(HFUtils.matchPathComponents(pathComponents5, patternComponents5));
+ }
+
+ @Test
+ public void testGlobExpansion() throws AnalysisException {
+ // Test non-glob URL (should return single result)
+ String nonGlobUrl = "hf://datasets/user/repo/path/file.txt";
+ List<String> result1 = HFUtils.expandGlob(nonGlobUrl);
+ Assert.assertEquals(1, result1.size());
+ Assert.assertEquals("https://huggingface.co/datasets/user/repo/resolve/main/path/file.txt", result1.get(0));
+
+ // Test glob URL validation
+ String globUrl1 = "hf://datasets/user/repo/path/*.parquet";
+ Assert.assertTrue(HFUtils.isValidGlobUrl(globUrl1));
+
+ String globUrl2 = "hf://datasets/user/repo/path/*.csv";
+ Assert.assertTrue(HFUtils.isValidGlobUrl(globUrl2));
+
+ // Note: Real glob expansion tests would require actual HuggingFace API calls
+ // The actual expansion will fail without real API access, but URL parsing works
+ }
+
+ @Test
+ public void testGlobExpansionExceptions() throws AnalysisException {
+ // Test null URL
+ try {
+ HFUtils.expandGlob(null);
+ Assert.fail("Should throw AnalysisException for null URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ // Test empty URL
+ try {
+ HFUtils.expandGlob("");
+ Assert.fail("Should throw AnalysisException for empty URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("cannot be null or
empty"));
+ }
+
+ // Test invalid URL
+ try {
+ HFUtils.expandGlob("http://example.com/*.txt");
+ Assert.fail("Should throw AnalysisException for invalid URL");
+ } catch (AnalysisException e) {
+ Assert.assertTrue(e.getMessage().contains("must start with
'hf://'"));
+ }
+
+ List<String> res = HFUtils.expandGlob("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv");
+ Assert.assertEquals(1, res.size());
+ Assert.assertEquals("https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv",
+ res.get(0));
+
+ ParsedHFUrl parsed = HFUtils.parseHfUrl("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv");
+ Assert.assertEquals("/prompts.csv", parsed.getPath());
+
+ res = HFUtils.expandGlob("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*");
+ Assert.assertEquals(3, res.size());
+ Assert.assertTrue(res.contains(
+ "https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv"));
+ Assert.assertTrue(res.contains(
+ "https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/.gitattributes"));
+ Assert.assertTrue(res.contains(
+ "https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/README.md"));
+ }
+}
+
diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift
index 62613e4c760..0b1cc973a28 100644
--- a/gensrc/thrift/Types.thrift
+++ b/gensrc/thrift/Types.thrift
@@ -693,6 +693,7 @@ enum TFileType {
FILE_S3 = 3,
FILE_HDFS = 4,
FILE_NET = 5, // read file by network, such as http
+ FILE_HTTP = 6,
}
struct TTabletCommitInfo {
diff --git a/regression-test/data/external_table_p0/tvf/test_http_tvf.out b/regression-test/data/external_table_p0/tvf/test_http_tvf.out
new file mode 100644
index 00000000000..891ce36d02e
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_http_tvf.out
@@ -0,0 +1,176 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql01 --
+0 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+10 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+100 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1000 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1001 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1002 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1003 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1004 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+1005 2 3 4 5 6.6 7.7 8.8 abc def ghiaaaaaa 2020-10-10 2020-10-10 11:12:59
+
+-- !sql02 --
+2500
+
+-- !sql03 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+c5 text Yes false \N NONE
+c6 text Yes false \N NONE
+c7 text Yes false \N NONE
+c8 text Yes false \N NONE
+c9 text Yes false \N NONE
+c10 text Yes false \N NONE
+c11 text Yes false \N NONE
+c12 text Yes false \N NONE
+c13 text Yes false \N NONE
+
+-- !sql04 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+c5 text Yes false \N NONE
+c6 text Yes false \N NONE
+c7 text Yes false \N NONE
+c8 text Yes false \N NONE
+c9 text Yes false \N NONE
+c10 text Yes false \N NONE
+c11 text Yes false \N NONE
+c12 text Yes false \N NONE
+c13 text Yes false \N NONE
+
+-- !sql05 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+c5 text Yes false \N NONE
+c6 text Yes false \N NONE
+c7 text Yes false \N NONE
+c8 text Yes false \N NONE
+c9 text Yes false \N NONE
+c10 text Yes false \N NONE
+c11 text Yes false \N NONE
+c12 text Yes false \N NONE
+c13 text Yes false \N NONE
+
+-- !sql05 --
+2500
+
+-- !sql06 --
+20
+
+-- !sql07 --
+k00 text Yes false \N NONE
+k01 text Yes false \N NONE
+k02 text Yes false \N NONE
+k03 text Yes false \N NONE
+k04 text Yes false \N NONE
+k05 text Yes false \N NONE
+k06 text Yes false \N NONE
+k07 text Yes false \N NONE
+k08 text Yes false \N NONE
+k09 text Yes false \N NONE
+k10 text Yes false \N NONE
+k11 text Yes false \N NONE
+k12 text Yes false \N NONE
+k13 text Yes false \N NONE
+k14 text Yes false \N NONE
+k15 text Yes false \N NONE
+k16 text Yes false \N NONE
+k17 text Yes false \N NONE
+k18 text Yes false \N NONE
+
+-- !sql08 --
+25 [["pVzcKC-4YFR2VM-hAF-4wbj", null, "puPe8Y-CvN1o8z-YDW-956F",
"NpRzsr-8KGoqbr-RnS-gmVb", "7J1bbm-vPRco5H-HyR-jLff"],
["F1C8O5-JBIfHix-br3-L3a4", null, "eb6vio-XsxJ5Sk-bFE-PbYB",
"ElNPdg-za24mCK-LeD-cN7E", "oJLv7H-elMwuV7-TZT-XWEe"],
["Ft5ADO-0LrvGT7-vXJ-bb7b", "DEMymk-WDCqA2c-aGK-hC1m",
"jxVnmH-k0M7iQl-tzz-M1e4", "0PRgjU-MY7jnay-qWv-rkyg", null], null,
["jXsrIF-1FnfVfP-wV6-u2kr", "Omp5zc-O5RLdRO-5Ql-UG6u",
"NBb9Cn-x2RW6KT-CHD-p3wA", null, "tsYVut-EigOUzE-Lle-Hs14"]]
[{"5TmJmI-3HVmy0t- [...]
+26 [["1cLgOq-jhNeMEG-Dtw-4AwL", "jhZcsW-CGyj1kt-sQ7-0aJX", null,
"VdQfoU-hrZt0zV-sO1-tsWp", "wn3kwP-lB1AxGC-epk-VD8u"],
["3xsktg-6bFiUt4-Q7u-Bi9v", null, "ucSLCY-DJ0zx8j-9yj-2lEA",
"8ltbUA-bOjtDdV-Ojs-smeQ", "unUDj7-FBicSrt-QwN-95uj"], [null,
"sfGvVX-smGcvy2-h8W-BYsm", "c6HKrq-XH4VGV6-64O-vyKV",
"i5a7tM-CFYAieL-WJ8-ZPvH", "7i2MN1-rvPWCl7-s2Y-xfY7"], null,
["9o5TWr-Eh4n0uh-gNz-eAmq", "qC7TXd-IwtcLU8-hke-NE37", null,
"cResuY-IsHEewt-YJq-2Xu5", "zWZBBW-PXIPZnq-S5Y-OhDC"]] [{"xWMxf4-uFVGZNe- [...]
+27 [null, [null, "nspd8V-YHBG4C6-Tvf-gX5i"]] [{"0N4Qgs-iPTaGkG-N4d-vXRb":"2023-08-13"}, {"R2XNFG-TjXu1Bi-fUL-RREK":"2023-08-13"}, {"lsBVJL-XNm8KEw-c5e-B3iY":"2023-08-13"}] [{"vin":"OCL5yv-CbFRH8z-UBm-Onqa", "charge_id":6}, {"vin":"yNOKGc-ogGNyrp-rEM-TbWL", "charge_id":7}, {"vin":"V6LRT5-24MfA4k-8BD-O9gL", "charge_id":7}, {"vin":"249uKy-454ywiX-rhs-xHKf", "charge_id":6}] {"fVXa0E-xsoMkYX-XVM-pwu2":{"s5WU6X-qzlKKdR-t8a-6fIG":0.5003744327257579, "VGJxRD-s6C6zPQ-1AX-kyIa":0.522556668127189, [...]
+28 [[null, "MBuONO-DddJpWd-YYg-MSAG", "loWxFz-BPZeiDk-xN0-SVZW", "HwiB4h-EDkkufQ-Zg5-9WpH"], null, [null, "Mfs6G6-e4Kh5u5-rUC-F1uW", "1F2nID-KF9Lshh-Zav-ptfh", "mBDoQl-U8oYBUE-zu0-Bi58"], ["En2b2G-QTh1FPR-F2J-h7uw", null, "VicBSI-eTAFJcg-Fo2-VYaJ", "6ejh2T-VZkvqPv-7gz-hnrd"]] [{"9odUAi-6Yl0gDO-9au-9TTI":"2023-08-13"}, {"Raj6No-gizgyb2-4qO-5UHu":"2023-08-13"}, {"HQDXNK-uqxiy4r-ksG-6ktV":"2023-08-13"}, {"suGkbd-dAFaNhh-wId-w6K6":"2023-08-13"}, {"i6y7h1-GVZnnd8-8eh-LyvR":"2023-08-13"}, {"Xi [...]
+29 [null, ["VXefdi-tyIlR8m-gCC-AjnK", "KoBsFd-VQFrvFF-s2O-uFze", "R5HVUm-CeXPCkn-Ydv-X0O5", null], ["yczNuG-kFoLCzw-wVK-m0EL", "ezESaZ-dYgklBS-irV-4dID", null, "2mxKdq-bmO6fIw-bb7-H0yE"], ["1thl5g-XaBs3HE-bHS-etgP", "M60GP9-7SqC0Kz-kBo-cXu8", "cUkthX-Ap78LOk-8HW-vayb", null]] [{"3Gp7Yq-JCwRlJ8-O2C-6fjg":"2023-08-13"}, {"7OSwlc-jsn2rty-VYy-F54v":"2023-08-13"}, {"zZTFUa-cDGEEAm-QsO-C9q5":"2023-08-13"}, {"FvtwJc-eyNEZ1d-dx0-nCGb":"2023-08-13"}, {"dhBtg6-5wnSN1w-VlW-A6xy":"2023-08-13"}] [{"v [...]
+30 [["DLwzZm-gnCo8o3-pWp-5GIu", "Phagik-6Ns5YUc-q7t-JJRf",
"89PSpv-T96ZR11-KkS-bi2u", "TsJ1tS-RoR6Ha0-0yo-COyr", null],
["BtELv1-FAJC3cM-lAO-xzFF", "Rq2YzY-f2RI924-fI1-tKjr",
"Zhkuyu-3ppFLv4-qZ0-ftCH", null, "0yf8lS-MWIpBjA-mVm-OTXR"],
["PAbA8p-ygabPZ6-zv0-XbXa", "PsTnTE-JakUfJF-9gt-zGG9",
"5Tx6bc-ZntvYXp-w0V-bzaz", null, "JZoZ4q-9a1RKo7-8MW-bjKX"], null,
["roqBXP-0Cop9U0-S4T-9NCz", "VnRnDi-3yB4YpD-ZyO-Rcmv",
"Ue4v3i-g6xJfLo-k49-zcg6", "mQrVTO-ptLLBIT-CKo-YrTb", null]]
[{"O6jI0T-vYIfPmb- [...]
+31 [null, ["hxE9tP-ES0ywas-9ve-qyDT", null, "GHZxzf-e6mqxJ2-BV2-0rdf", "4XDBD4-hkvkBdZ-0Kg-sZpW", "d8Giqq-XEZgQxA-u2F-Zkwb"], ["8lUWqZ-i1SctAd-bZy-sAuw", "cETLpP-8GA5jmM-GEL-CWZI", null, "QjKEUM-VN7ULGe-LmF-ZCe1", "gGtRa4-eWdxr5C-Y4V-E8Nz"], ["Yg8ky6-8ds0ob0-m75-AqcM", "XXaG3x-HTcSCaT-Nl2-FT2z", "PUgDQx-tHwlMpI-60c-Nsk6", "2oWo0G-j94Wbdb-rtm-0zKD", null], ["5hLeYN-Dk3SrI6-tsM-npdb", null, "AXqiTT-8jTmWDW-HFN-zNG9", "szQhr9-9qRIj7N-xen-midZ", "nKGe2u-nyuSf4D-Qlx-yU4L"]] [{"W3obeg-hTdQjq7- [...]
+32 [["PvEAGq-5J3ydw2-SCg-6Sm5", "bi7WP1-rj3rsXL-0CR-g7jK",
"bRG52A-t7PBOaQ-L2Q-KecR", "UgbKVA-roWdAzr-wHq-uRg4", null], null,
["v6eqEI-XTWA5Xj-aP4-EUAw", "GvgD7n-iegZ942-6d2-tPWV", null,
"NmMoiy-kdyanyV-wb9-6EZ2", "qqUWob-5BJnc4p-zKl-PmVw"],
["XIyb7J-PeYlc9D-eWN-7GLi", null, "x5OCi3-IGBxmlM-dCg-n59q",
"5HVLto-4vnWy0L-jFN-g2LF", "HJXFed-KSQM50g-alH-cCd9"],
["91Ozzg-qMRSn4U-g84-OQE5", "qWRp5Q-gSSpT2J-hgd-fmy9",
"GnXfEv-cVXlH4H-vvo-INHz", "OebFDG-UNRLMKM-gSl-czBx", null]]
[{"3joUkd-Zy6qqZX- [...]
+33 [["Alu6Ut-pym9uxc-f9V-VZtc", "dxfkfq-wSWY1mf-MCy-C6RL",
"jvIHsc-56opIJg-DVE-3j4e", null, "1HIy6L-HDosxgv-beC-TczB",
"aQ1Vl7-QlLYZUN-MpB-b8XI"], null, ["vOiBV4-c9k237Z-MM3-6Sgp",
"JW1UYX-TSFOjWq-Tu4-LSJz", "P5fgzE-nmTunS2-yr3-kEqK",
"eH5hjn-QWWrDXc-1vp-PmzW", "m20eP6-eZmU2Cj-qkH-Anc2", null],
["Gd5uCH-doe3Mf9-Af5-KS6J", "FylE74-etOFPlN-kFw-jjuq", null,
"ZUc1uK-k2KJtYK-y1p-ARSo", "nzBq5J-56Xov73-6xO-VI8Q",
"xoSzVC-dlhJgZh-XpI-iABo"], ["5PmSbt-SYpRG3P-Rkm-BZK7",
"CcuuXB-CoIOZdt-xGZ-zlAy" [...]
+34 [["jUSz7U-Cer18Xx-yVx-T8ff", "m2co1R-yrE6xgn-Unr-stw1", null], null,
["OuQ3zk-V64e9tJ-uBm-PvWT", "nEqM5q-xKo1pcE-S6M-lbAk", null]]
[{"l5rFuO-4Qn7arh-0Mu-45p5":"2023-08-13"},
{"ALYX0U-y6GeYN2-31y-FqWv":"2023-08-13"},
{"vi9Mo6-Nf8VTVB-1Tv-MvTG":"2023-08-13"},
{"DkcoGP-OXrLStI-sXX-KQ8l":"2023-08-13"}]
[{"vin":"H3NJs9-XFM27Mq-E60-gErD", "charge_id":3},
{"vin":"1rHzwj-nWluuUa-D5c-SY6Q", "charge_id":4},
{"vin":"0AUoC3-OxtVnr6-2mX-LkEB", "charge_id":0},
{"vin":"QTGWJr-iXuPdTY-lkI-6ju5", "cha [...]
+
+-- !sql09 --
+[{"5TmJmI-3HVmy0t-AZJ-49FX":"2023-08-13"},
{"0AG1xf-dy1RcNW-Ped-td4S":"2023-08-13"},
{"nLrqP3-SwoheqC-CEy-8XoO":"2023-08-13"},
{"V5QJNN-TG06d1z-Ivq-x1Rq":"2023-08-13"},
{"aG5O5A-ycB4pDt-N3o-uX6i":"2023-08-13"},
{"VYdsqd-aVLhQ9L-UmK-2xaz":"2023-08-13"},
{"N6Cw0y-Jb45TsT-1eS-ok4f":"2023-08-13"},
{"GtjDT5-ydD7TwS-jfM-UN23":"2023-08-13"}] 25
+[{"xWMxf4-uFVGZNe-YA7-eAau":"2023-08-13"},
{"DRmO1m-NOIjiU7-9rY-vgNY":"2023-08-13"},
{"lAZgMl-JE2DNvX-LsV-80Ip":"2023-08-13"},
{"3BMdOY-epaTDKh-ykC-Biq0":"2023-08-13"},
{"75wIx6-8tIELFt-9J1-0H0p":"2023-08-13"}] 26
+[{"0N4Qgs-iPTaGkG-N4d-vXRb":"2023-08-13"},
{"R2XNFG-TjXu1Bi-fUL-RREK":"2023-08-13"},
{"lsBVJL-XNm8KEw-c5e-B3iY":"2023-08-13"}] 27
+[{"9odUAi-6Yl0gDO-9au-9TTI":"2023-08-13"},
{"Raj6No-gizgyb2-4qO-5UHu":"2023-08-13"},
{"HQDXNK-uqxiy4r-ksG-6ktV":"2023-08-13"},
{"suGkbd-dAFaNhh-wId-w6K6":"2023-08-13"},
{"i6y7h1-GVZnnd8-8eh-LyvR":"2023-08-13"},
{"XiZxt7-NYBhXzc-GtE-y1dN":"2023-08-13"},
{"cRQTEh-nwhZErS-Nr5-Fjmr":"2023-08-13"}] 28
+[{"3Gp7Yq-JCwRlJ8-O2C-6fjg":"2023-08-13"},
{"7OSwlc-jsn2rty-VYy-F54v":"2023-08-13"},
{"zZTFUa-cDGEEAm-QsO-C9q5":"2023-08-13"},
{"FvtwJc-eyNEZ1d-dx0-nCGb":"2023-08-13"},
{"dhBtg6-5wnSN1w-VlW-A6xy":"2023-08-13"}] 29
+[{"O6jI0T-vYIfPmb-rej-lw2p":"2023-08-13"},
{"Crykzj-vqnbDR5-KZE-1fiC":"2023-08-13"},
{"ljhnru-uLcDSlv-XWX-j3Tm":"2023-08-13"},
{"K2Xi6I-WLl9Saj-KlJ-MDtG":"2023-08-13"},
{"MMC92c-mJdccLa-YzF-JGfa":"2023-08-13"},
{"SXS2ml-4gLVQ1A-S7q-oRTQ":"2023-08-13"},
{"u0vvu4-gUVPuGE-rPG-sbTu":"2023-08-13"}] 30
+[{"W3obeg-hTdQjq7-Oto-wQem":"2023-08-13"},
{"w9VzsI-NFpuORO-30P-8VZY":"2023-08-13"},
{"A8A0Ep-IQyfVhW-TQQ-plXt":"2023-08-13"},
{"6IhomZ-52jOtZh-g3Z-bXuJ":"2023-08-13"},
{"XE7tMX-36iVM10-BpD-bwMc":"2023-08-13"}] 31
+[{"3joUkd-Zy6qqZX-HIC-LBCI":"2023-08-13"},
{"3Ntsk5-uTDXEL3-j1t-JvCp":"2023-08-13"},
{"Ed7N57-xcErXkc-dnm-18qP":"2023-08-13"},
{"HUoBEV-PkJI7Cv-mUQ-2jrR":"2023-08-13"},
{"JDZ5eW-q7DY0t9-XO8-ryKw":"2023-08-13"},
{"DjFP33-KmnbRsG-ukV-cbrD":"2023-08-13"}] 32
+[{"5tn1K4-fMUCzvZ-JMT-I0fB":"2023-08-13"},
{"qS8FoC-NjEIzin-Fyn-i2Pf":"2023-08-13"},
{"l9LtTo-wFC0tjJ-YXT-vEtB":"2023-08-13"},
{"7vON38-aB2GA3i-B2w-rY2x":"2023-08-13"},
{"MQ7vz3-aQ9a0EZ-5W0-OsFD":"2023-08-13"},
{"inQqA5-a3GpVBp-Brd-q1OV":"2023-08-13"},
{"yjvpW3-whi47LZ-E37-Fq2e":"2023-08-13"}] 33
+[{"l5rFuO-4Qn7arh-0Mu-45p5":"2023-08-13"},
{"ALYX0U-y6GeYN2-31y-FqWv":"2023-08-13"},
{"vi9Mo6-Nf8VTVB-1Tv-MvTG":"2023-08-13"},
{"DkcoGP-OXrLStI-sXX-KQ8l":"2023-08-13"}] 34
+
+-- !sql10 --
+id int Yes false \N NONE
+arr_arr array<array<text>> Yes false \N NONE
+arr_map array<map<text,date>> Yes false \N NONE
+arr_struct array<struct<vin:text,charge_id:int>> Yes false \N NONE
+map_map map<text,map<text,double>> Yes false \N NONE
+map_arr map<int,array<double>> Yes false \N NONE
+map_struct map<datetime(6),struct<vin:text,charge_id:int,start_time:double>> Yes false \N NONE
+struct_arr_map struct<aa:array<text>,mm:map<date,text>> Yes false \N NONE
+
+-- !sql11 --
+{"5kRblH-cq4ElSG-qui-NNog":"OnU5uR-13DpZEM-azK-P1AE",
"Vfiafd-4kxIoQm-ji5-Nb8n":"dznBUH-cP9ww01-QvQ-7gfm",
"X0rjvB-TQ7rvzt-O6F-US5c":"YcqBZA-xscceEm-LGh-DoZr",
"riZBcx-kjRGLAT-pmQ-JKfL":"8rSFPX-ycfitem-5no-yikq"} 0
+{"1nybuE-Rf1unch-I5z-oLjz":"x8AF5t-wx1I2Ro-RD9-qoCQ",
"2XGrXq-YexqEiQ-VCE-0jwl":"xJY6Nj-KJZEMG9-VyM-Posk",
"3Vh5kU-zjgHF62-CrE-PSLC":"ki3Swb-MtmCpdf-R2l-Ugq0",
"BJwk9c-oHa7kTQ-Bdq-jt01":"IVatWf-YZGfujf-tSI-K9Gd",
"CMdMYR-aSuYPDm-uja-LEll":"JzwHPd-3zTnWx1-23i-VrUX",
"HNYc3t-qUcRi5c-OSM-zyhe":"wyvzTC-SbcLMq2-RPG-uvab",
"KX2ZS0-YbYRmC8-6rh-hdbH":"Qisdjq-p8R9GkZ-gCB-Beu8",
"Lb61yB-xzY68DL-hgd-CIb3":"g0FdDK-lkwWssW-gOt-zC1M",
"LbMCG6-p2vJtbI-Rmb-Jtqk":"9mCHS9-Pl85VW8-Tks-WNz2", "Lnjdhw-rzHaxJ [...]
+{"AU99kT-ZQZ0RAq-RA2-f0Tz":"u9W7OS-fFaZ34X-ioi-9JvB",
"FqcqWE-YwTtNRe-kp2-9CnR":"nMS5I2-nMuPLhD-isT-Xzsg",
"Lv0rRz-RUTz5Z7-W4B-NT18":"M0f065-mDJidvY-46Q-nSZ2",
"VSPxZK-c4cj66k-EYw-JnoM":"HSMPX3-yGJEvjB-xxy-mCcE",
"Vo2O4g-kcT14KR-VYm-Q1jv":"rBS2MG-0BXCtpU-OPL-jp3t",
"aLQLrD-gAHzact-Ca1-p5kt":"1CJ0sn-Ja8yPwW-byx-H1mP",
"amXhSp-VTyggqP-7vc-bNkp":"EFfs5D-sA8rn8Y-2s4-3Z5r",
"m0DbNi-Ldumv2W-Rp4-s4Ho":"oRAmyA-yoGbZIM-3w1-4W2X",
"ukdPPj-m4XorVi-09g-Cag7":"enOqzd-C4ucklT-EZ5-Gpj3", "y9pfS0-KWz8BP [...]
+{"4Shnpr-QG4n30n-K5m-qxFM":"bmyad7-jrJpoyV-gpB-zk7X",
"KZQeFF-UAeo0an-PHS-j8Kc":"22wd40-1Kzu1tK-tDv-URiX",
"SRnN8p-JBgAfei-QrR-k42a":"WmLAYu-yKW2rSd-XsK-hOKo",
"Z31dFq-tMYzAFr-CF9-k9r4":"rWR3X7-USwcvvP-aVA-61do",
"dY2oEN-dSBzeec-dx8-IPi8":"qV1dEj-MrPHPvr-1p3-0WDQ"} 3
+{"0nO5Ha-oQpanmE-vjr-wBz9":"ke5lfu-0TtpnSs-DzS-c1gC",
"1mPQt5-N6ejgq2-8k9-qxQI":"U5gvPP-03zzaTX-MNg-78tc",
"4f8yq6-UEmVDdY-O7t-cWM8":"w66Vgt-RHCwhET-vT7-3Xfm",
"4ij3Bo-LRn8rNA-RPj-pmCk":"NpJNko-zfD2kji-Qpo-gdf7",
"C6uxM3-1HqfKJq-qF8-wU6X":"Hp2dh3-CWf3fUn-yfV-Lwg4",
"CFk7jl-59cfmRQ-VLU-OeYE":"gDQNNV-LILuKC3-65e-ECnd",
"IJAhrL-Yez3u2u-aUh-ZPxb":"P0z9FS-S0kuFkQ-P4m-2C8T",
"ISH7Wx-xe5yk41-ZKV-W3bw":"XQkTKO-SL91HDM-GQX-77fc",
"IyZDfS-HrXPdPF-dje-jKhQ":"HzmgJk-48dj4aP-TOg-Pr2n", "MZyoHb-xo1QHq [...]
+{"0OWzYa-vHt55Te-KO2-SbSK":"pPW1Lp-cEEraS6-F70-DEHu",
"80vQNK-Zc0JIEk-vIL-PwYk":"Wv17ZR-icd0BNR-N7N-YMcM",
"CB9FQo-0FwDBNg-Ktj-oRRI":"Lmygb1-RIRNPwO-vSD-fmNg",
"Ecfczs-0aE9GoC-JwP-Eg8b":"8WnzeR-7mFIbK7-7wz-W8N3",
"GNTrdK-ucPs4wU-nZb-Vj3c":"9Ks4Jf-F581kcK-Bpr-VSLT",
"Ge1aoF-Uo27vwi-oYT-fQEe":"1rSZ8k-hdi2UwO-SXg-ETIZ",
"J4rD9X-htCaoeB-pP5-c0nx":"iwVNXY-CIGRV7x-S3C-LQk1",
"RNH5AR-kJS3g7f-tky-XiZm":"seV1Ie-3AFJa06-eF3-7KhV",
"SEJ2dS-iwtY85M-JRt-yVft":"BZOnTC-rqLYh75-3ut-S47c", "U4L4PW-7na9ki [...]
+{"3Et7fv-M72tm16-gbU-cOLu":"xANIdF-mI1CNOr-pWV-X8Is",
"3GJ6pH-RY6Kw3Z-gG8-4sYX":"huMfKy-sTFvRp6-5pS-r4RY",
"BZuShL-qdH02tB-aHy-XEnD":"4QAq1j-4EpEJjR-upF-0eHf",
"D8yKgL-yZKJfqR-LBC-g1qD":"zHmIcI-z7xIGYs-0FS-OJz8",
"EKI9oO-yvdverI-vKs-z0dr":"6KPSy8-TmhMGby-Oju-AgEl",
"GGBFWm-e0CWbgN-k2i-V0Do":"SpLMnJ-AY2GIZ7-aVC-5fbv",
"Nm5GIW-2xOq5Nv-vMX-xZOc":"HeRi0T-zceRW1H-dye-hINv",
"OKkhNd-byVdWKi-rDX-MBhy":"QYqpfX-fVdjZEh-ZT5-10v2",
"TCvUh6-glhQGOr-RAz-DVVz":"Uxeb7F-jb5OTJX-Ws8-g8bD", "ZGNoc0-QzE2WG [...]
+{"0ea0sV-GxptYqE-blm-B9xZ":"QVhsnF-xqFSbmA-07t-qirt",
"33C13P-5WhvJV8-2sD-KBqe":"C5wi7g-xF891IH-tQ1-EDbn",
"6HkWEh-r4h40fS-bkj-5wLR":"Ep25v6-RbnaMFu-yKQ-j8ch",
"Ga18hq-cJ7e4lj-uGt-kIba":"NGX5No-hGtiw8d-jfY-bVmb",
"JACjUX-un5ejZT-i8B-bM1x":"bAXg07-KY89bjS-epB-ygOn",
"OqZD28-IkYsVmL-IFu-881g":"RHdn0i-8VI5Ash-pn7-K7ZU",
"TYKXmD-4EVB6ZU-EEw-Wfmw":"Q3Z4Xs-mhIBXGX-C7K-jpfu",
"TbtmYG-dr82vFI-9Ik-8tHv":"BnMiUu-GAdWoVd-QjL-3uMS",
"XSvkAW-l583a3t-rZt-Jjuk":"fj0Sbf-R1611Sx-fDd-pwuM", "jsacjC-mAa1Fk [...]
+{"EHPywb-F9iDeuX-e86-KwET":"fTNvrU-Ikjq1Cm-Mbu-UnVy",
"HnJPY4-uouJIbu-LB1-OhqO":"98knJL-MUz9JPZ-5LP-3JsR",
"IFxFdo-5PrPzXk-2FU-LHce":"3aP7Zr-PSfBxXu-5DE-nfVS",
"LgnpjC-ZaHP1La-zam-zXIm":"ZskcNG-3H1fd5Y-Vyl-h5GN",
"RS6nAv-qEEaYY2-6Ds-yUhy":"HM7gTt-5MUkquH-Txr-t7mD",
"TKwaKm-ZOQjsHG-PwC-0fiA":"0LgFaG-geVXe50-3UX-tODS",
"WWtatH-QetCdFe-Eib-47O2":"3iwwNh-jB1qllf-n3T-bPJ4",
"f0nWDp-3qaTaM7-J7D-mFrm":"IHzZLg-aVIyuP0-LIW-mQXH",
"i43Ahd-CsQusaH-Omb-ORIL":"RTq87y-Fj0hdGu-J7L-qqj6", "idEyZz-g7i92c [...]
+{"29pAXc-ONnsxuI-FEB-YbC2":"Q8NgSG-qvUmRj1-nPa-GDDl",
"3IfJH4-Ze2BVSr-XEV-W7iH":"xvZI5b-WcxTY5W-yuy-XJPM",
"3R6qsn-mmMb9zJ-Zhl-LWhl":"hVJwfD-ysvwyQm-L7o-6YE7",
"6G1zZ5-dmrj5IM-xuz-tBGt":"Yaczkp-vsaALj7-P58-sInO",
"7aaMuV-sByfVBd-c2C-GZoC":"0KoqOO-0mBdHLL-wTE-CyGD",
"AqIVAA-2zsBmNO-aev-zJKB":"wRhKLC-S1zCBjm-b5G-Kxvb",
"ByWvzj-8WMBhwQ-t5K-CMk4":"zDp1hy-oO8MzyD-j2D-u4Dd",
"CLElsX-o2zma1N-Z1U-IOYH":"O6CV1V-GyRb3xj-BDz-rjWd",
"LdDTw3-2F11YTZ-6bc-XbJA":"Dced4R-xq9sbe1-ivj-38jX", "RlFe7w-J3rXsx [...]
+
+-- !sql12 --
+id int Yes false \N NONE
+m map<text,text> Yes false \N NONE
+
+-- !sql13 --
+1 1.1234 12.123456 123.123456789876 12 1234.123456789 123
+2 1234.1234 123456789123.123456 12345678912345678912345678.123456789876 123456789 123456789123456789.123456780 987654321
+3 1234.0000 123456789123.000000 12345678912345678912345678.000000000000 123456789 123456789123456789.000000000 987654321
+4 \N \N 123.123456789876 12 \N 123
+5 1.1234 12.123456 \N \N 1234.123456789 \N
+
+-- !sql14 --
+1 1.1234 12.123456 123.123456789876 12 1234.123456789 123
+2 1234.1234 123456789123.123456 12345678912345678912345678.123456789876 123456789 123456789123456789.123456780 987654321
+3 1234.0000 123456789123.000000 12345678912345678912345678.000000000000 123456789 123456789123456789.000000000 987654321
+4 \N \N 123.123456789876 12 \N 123
+5 1.1234 12.123456 \N \N 1234.123456789 \N
+
+-- !sql15 --
+204
+
+-- !sql16 --
+204
+
+-- !sql17 --
+c1 text Yes false \N NONE
+
+-- !sql18 --
+{"description": "Large Movie Review Dataset.\\nThis is a dataset for binary
sentiment classification containing substantially more data than previous
benchmark datasets. We provide a set of 25,000 highly polar movie reviews for
training, and 25,000 for testing. There is additional unlabeled data for use as
well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\\n author =
{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan
and Ng, Andrew Y. and [...]
+
+-- !sql19 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this film had an ending TO spoil... I only started watching it in the middle, after Matt had gotten into Sarah's body, but then I became fascinated by the bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly see how Matt wld end up happy. What about his fiancee? At one stage looked like he was gonna get with his best friend, surely icky and wrong... and then the whole 'oggi oggi oggi' thing [...]
+
+-- !sql20 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this film had an ending TO spoil... I only started watching it in the middle, after Matt had gotten into Sarah's body, but then I became fascinated by the bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly see how Matt wld end up happy. What about his fiancee? At one stage looked like he was gonna get with his best friend, surely icky and wrong... and then the whole 'oggi oggi oggi' thing [...]
+
+-- !sql21 --
+A Turkish Bath sequence in a film noir located in New York in the 50's, that must be a hint at something ! Something that curiously, in all the previous comments, no one has pointed out , but seems to me essential to the understanding of this movie <br /><br />the Turkish Baths sequence: a back street at night, the entrance of a sleazy sauna, and Scalise wrapped in a sheet, getting his thighs massaged. Steve, the masseur is of the young rough boxer ( Beefcake!) type , and another guy [...]
+
+-- !sql22 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this film had an ending TO spoil... I only started watching it in the middle, after Matt had gotten into Sarah's body, but then I became fascinated by the bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly see how Matt wld end up happy. What about his fiancee? At one stage looked like he was gonna get with his best friend, surely icky and wrong... and then the whole 'oggi oggi oggi' thing [...]
+
diff --git a/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy b/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy
new file mode 100644
index 00000000000..133b992e8c4
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy
@@ -0,0 +1,255 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_http_tvf", "p2") {
+ // csv
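+ // Plain-HTTPS CSV: full scan, row count, and schema inference via
+ // "desc function". qt_sql04 below reads the same file through file().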
+ qt_sql01 """
+ SELECT *
+ FROM http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/http_stream/all_types.csv",
+ "format" = "csv",
+ "column_separator" = ","
+ )
+ ORDER BY c1 limit 10;
+ """
+
+ qt_sql02 """
+ SELECT count(*)
+ FROM http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/http_stream/all_types.csv",
+ "format" = "csv",
+ "column_separator" = ","
+ );
+ """
+
+ qt_sql03 """
+ desc function
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/http_stream/all_types.csv",
+ "format" = "csv",
+ "column_separator" = ","
+ );
+ """
+
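+ // file() handles http(s) URIs too when "fs.http.support" = "true" is set.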
+ qt_sql04 """
+ desc function
+ file(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/http_stream/all_types.csv",
+ "format" = "csv",
+ "fs.http.support" = "true",
+ "column_separator" = ","
+ );
+ """
+
+ // csv with gz
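+ // "compress_type" = "gz" gunzips the response before CSV parsing.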
+ qt_sql05 """
+ desc function
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/all_types.csv.gz",
+ "format" = "csv",
+ "column_separator" = ",",
+ "compress_type" = "gz"
+ );
+ """
+
+ qt_sql05 """
+ select count(*) from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/all_types.csv.gz",
+ "format" = "csv",
+ "column_separator" = ",",
+ "compress_type" = "gz"
+ );
+ """
+
+ // json
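+ // "strip_outer_array" = true turns a top-level JSON array into one row per element.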
+ qt_sql06 """
+ select count(*) from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/basic_data.json",
+ "format" = "json",
+ "strip_outer_array" = true
+ );
+ """
+
+ qt_sql07 """
+ desc function
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/basic_data.json",
+ "format" = "json",
+ "strip_outer_array" = true
+ );
+ """
+
+ // parquet/orc
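+ // Parquet/ORC readers seek to footers and row groups/stripes instead of
+ // reading sequentially, so these queries also exercise ranged HTTP reads.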
+ qt_sql08 """
+ select * from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/external_table_p0/tvf/t.parquet",
+ "format" = "parquet"
+ ) order by id limit 10;
+ """
+
+ qt_sql09 """
+ select arr_map, id from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/external_table_p0/tvf/t.parquet",
+ "format" = "parquet"
+ ) order by id limit 10;
+ """
+
+ qt_sql10 """
+ desc function
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/external_table_p0/tvf/t.parquet",
+ "format" = "parquet"
+ );
+ """
+
+ qt_sql11 """
+ select m, id from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/types/complex_types/mm.orc",
+ "format" = "orc"
+ ) order by id limit 10;
+ """
+
+ qt_sql12 """
+ desc function
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/types/complex_types/mm.orc",
+ "format" = "orc"
+ );
+ """
+
+ // non range
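+ // With "http.enable.range.request" = "false" the whole file has to be fetched
+ // up front, exceeding the 1000-byte "http.max.request.size.bytes" cap; with
+ // range requests enabled (qt_sql13/qt_sql14) only the needed ranges are read.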
+ test {
+ sql """
+ select * from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/test_decimal.parquet",
+ "format" = "parquet",
+ "http.enable.range.request" = "false",
+ "http.max.request.size.bytes" = "1000"
+ );
+ """
+ exception """exceeds maximum allowed size (1000 bytes"""
+ }
+
+ qt_sql13 """
+ select * from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/test_decimal.parquet",
+ "format" = "parquet",
+ "http.enable.range.request" = "true",
+ "http.max.request.size.bytes" = "1000"
+ ) order by id;
+ """
+
+ qt_sql14 """
+ select * from
+ http(
+ "uri" = "https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/test_decimal.parquet",
+ "format" = "parquet",
+ "http.enable.range.request" = "true",
+ "http.max.request.size.bytes" = "2000"
+ ) order by id;
+ """
+
+ // hf
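+ // hf:// URIs ("hf://datasets/<org>/<dataset>/blob/<revision>/<path>") are
+ // resolved to the matching HuggingFace HTTP download URLs before reading.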
+ qt_sql15 """
+ select count(*) from
+ http(
+ "uri" = "hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv",
+ "format" = "csv"
+ );
+ """
+
+ qt_sql16 """
+ select count(*) from
+ http(
+ "uri" = "hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*.csv",
+ "format" = "csv"
+ );
+ """
+
+ qt_sql17 """
+ desc function
+ http(
+ "uri" = "hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*.csv",
+ "format" = "csv"
+ );
+ """
+
+ // branch
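+ // "@<revision>" after the repo name selects a branch or tag (here: script).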
+ qt_sql18 """
+ select * from
+ http(
+ "uri" = "hf://datasets/stanfordnlp/imdb@script/dataset_infos.json",
+ "format" = "json"
+ );
+ """
+
+ qt_sql19 """
+ select * from
+ http(
+ "uri" = "hf://datasets/stanfordnlp/imdb@main/plain_text/test-00000-of-00001.parquet",
+ "format" = "parquet"
+ ) order by text limit 1;
+ """
+
+ // wildcard
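+ // Glob matching in hf:// paths: "*" matches within one path segment, "**"
+ // crosses directories, and "[1]" is a single-character class.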
+ qt_sql20 """
+ select * from
+ http(
+ "uri" = "hf://datasets/stanfordnlp/imdb@main/*/test-00000-of-00001.parquet",
+ "format" = "parquet"
+ ) order by text limit 1;
+ """
+
+ qt_sql21 """
+ select * from
+ http(
+ "uri" = "hf://datasets/stanfordnlp/imdb@main/*/*.parquet",
+ "format" = "parquet"
+ ) order by text limit 1;
+ """
+
+ qt_sql22 """
+ select * from
+ http(
+ "uri" = "hf://datasets/stanfordnlp/imdb@main/**/test-00000-of-0000[1].parquet",
+ "format" = "parquet"
+ ) order by text limit 1;
+ """
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]