This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new eae1b88df9f [feat](tvf) support huggingface with http tvf (#58049)
eae1b88df9f is described below

commit eae1b88df9fd707111b8a9e8ed2d803b57956e2a
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Wed Nov 19 12:55:31 2025 +0800

    [feat](tvf) support huggingface with http tvf (#58049)
    
    ### What problem does this PR solve?
    
    Related Issue: #57242
    
    Support reading huggingface with http tvf:
    
    ```
    select * from
    http(
        "uri" = 
"hf://datasets/stanfordnlp/imdb@main/**/test-00000-of-0000[1].parquet",
        "format" = "parquet"
    ) order by text limit 1;
    ```
    1. Support wildcard
    2. Support branch
    3. Support datasets and spaces of huggingface
---
 .../property/storage/HttpProperties.java           |  16 +-
 .../doris/httpv2/rest/manager/HttpUtils.java       |  13 +-
 .../org/apache/doris/tablefunction/HFUtils.java    | 755 +++++++++++++++++++++
 .../tablefunction/HttpTableValuedFunction.java     |  31 +-
 .../apache/doris/tablefunction/HFUtilsTest.java    | 416 ++++++++++++
 .../data/external_table_p0/tvf/test_http_tvf.out   |  45 +-
 .../external_table_p0/tvf/test_http_tvf.groovy     |  72 +-
 7 files changed, 1314 insertions(+), 34 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
index 72990e79a36..b6b9eaa63c6 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HttpProperties.java
@@ -20,6 +20,7 @@ package org.apache.doris.datasource.property.storage;
 import org.apache.doris.common.UserException;
 
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Maps;
 import org.apache.hudi.common.util.MapUtils;
 
 import java.util.Map;
@@ -41,8 +42,8 @@ public class HttpProperties extends StorageProperties {
 
     @Override
     public String validateAndNormalizeUri(String url) throws UserException {
-        if (url == null || (!url.startsWith("http://") && !url.startsWith("https://"))) {
-            throw new UserException("Invalid http url: " + url);
+        if (url == null || (!url.startsWith("http://") && !url.startsWith("https://") && !url.startsWith("hf://"))) {
+            throw new UserException("Invalid http/hf url: " + url);
         }
         return url;
     }
@@ -77,4 +78,15 @@ public class HttpProperties extends StorageProperties {
     protected Set<String> schemas() {
         return ImmutableSet.of("http");
     }
+
+    public Map<String, String> getHeaders() {
+        Map<String, String> headers = Maps.newHashMap();
+        for (Map.Entry<String, String> entry : origProps.entrySet()) {
+            if (entry.getKey().toLowerCase().startsWith("http.header.")) {
+                String headerKey = 
entry.getKey().substring("http.header.".length());
+                headers.put(headerKey, entry.getValue());
+            }
+        }
+        return headers;
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java 
b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
index 902262a5eb4..5eb9a5afd33 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/manager/HttpUtils.java
@@ -20,6 +20,7 @@ package org.apache.doris.httpv2.rest.manager;
 import org.apache.doris.catalog.Env;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.Util;
 import org.apache.doris.httpv2.entity.ResponseBody;
 import org.apache.doris.persist.gson.GsonUtils;
 import org.apache.doris.system.Frontend;
@@ -37,6 +38,8 @@ import org.apache.http.entity.StringEntity;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClientBuilder;
 import org.apache.http.util.EntityUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 
 import java.io.IOException;
 import java.net.HttpURLConnection;
@@ -51,6 +54,8 @@ import java.util.stream.Collectors;
  * used to forward http requests from manager to be.
  */
 public class HttpUtils {
+    private static final Logger LOG = LogManager.getLogger(HttpUtils.class);
+
     static final int REQUEST_SUCCESS_CODE = 0;
     static final int DEFAULT_TIME_OUT_MS = 2000;
 
@@ -146,7 +151,7 @@ public class HttpUtils {
      * @throws IOException if there's an error connecting to the HTTP resource
      * @throws IllegalArgumentException if the URI is null or invalid
      */
-    public static long getHttpFileSize(String uri) throws IOException {
+    public static long getHttpFileSize(String uri, Map<String, String> 
headers) throws IOException {
         if (uri == null || uri.trim().isEmpty()) {
             throw new IllegalArgumentException("HTTP URI is null or empty");
         }
@@ -164,6 +169,9 @@ public class HttpUtils {
             // Set common headers
             connection.setRequestProperty("User-Agent", "Doris-HttpUtils/1.0");
             connection.setRequestProperty("Accept", "*/*");
+            for (Map.Entry<String, String> entry : headers.entrySet()) {
+                connection.setRequestProperty(entry.getKey(), 
entry.getValue());
+            }
 
             // Connect and get response
             connection.connect();
@@ -187,7 +195,8 @@ public class HttpUtils {
                         + ", message: " + connection.getResponseMessage());
             }
         } catch (IOException e) {
-            throw new IOException("Failed to get file size for URI: " + uri, 
e);
+            LOG.warn("Failed to get file size for URI: {}", uri, e);
+            throw new IOException("Failed to get file size for URI: " + uri + 
". " + Util.getRootCauseMessage(e), e);
         } finally {
             if (connection != null) {
                 connection.disconnect();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java
new file mode 100644
index 00000000000..f97879632d4
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HFUtils.java
@@ -0,0 +1,755 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.tablefunction;
+
+import org.apache.doris.common.AnalysisException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Strings;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.util.EntityUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.IOException;
+import java.nio.file.FileSystems;
+import java.nio.file.PathMatcher;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Utility class for handling HuggingFace URLs and converting them to HTTP 
URLs.
+ *
+ * This class provides functionality to parse hf:// URLs and convert them to
+ * actual HTTP URLs that can be used to access files on HuggingFace Hub.
+ *
+ * Supported URL formats:
+ * - hf://datasets/username/dataset-name/path/to/file.parquet
+ * - hf://datasets/username/dataset-name@revision/path/to/file.parquet
+ * - hf://spaces/username/space-name/path/to/file.txt
+ *
+ * Example usage:
+ * String hfUrl = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+ * String httpUrl = HFUtils.convertHfUrlToHttpUrl(hfUrl);
+ * // Returns: 
https://huggingface.co/datasets/lhoestq/demo1/resolve/main/default/train/0000.parquet
+ */
+public class HFUtils {
+    private static final Logger LOG = LogManager.getLogger(HFUtils.class);
+
+    // Constants
+    private static final String HF_SCHEME = "hf://";
+    private static final String DEFAULT_ENDPOINT = "https://huggingface.co";
+    private static final String DEFAULT_REVISION = "main";
+    private static final String REPO_TYPE_DATASETS = "datasets";
+    private static final String REPO_TYPE_SPACES = "spaces";
+
+    // HTTP Client Configuration
+    private static final int DEFAULT_TIMEOUT_MS = 30000; // 30 seconds
+    private static final int DEFAULT_CONNECT_TIMEOUT_MS = 10000; // 10 seconds
+    private static final int DEFAULT_PAGE_LIMIT = 1000;
+
+    /**
+     * Parsed HuggingFace URL components
+     */
+    public static class ParsedHFUrl {
+        private String endpoint = DEFAULT_ENDPOINT;
+        private String repoType;
+        private String repository;
+        private String revision = DEFAULT_REVISION;
+        private String path;
+
+        public String getEndpoint() {
+            return endpoint;
+        }
+
+        public void setEndpoint(String endpoint) {
+            this.endpoint = endpoint;
+        }
+
+        public String getRepoType() {
+            return repoType;
+        }
+
+        public void setRepoType(String repoType) {
+            this.repoType = repoType;
+        }
+
+        public String getRepository() {
+            return repository;
+        }
+
+        public void setRepository(String repository) {
+            this.repository = repository;
+        }
+
+        public String getRevision() {
+            return revision;
+        }
+
+        public void setRevision(String revision) {
+            this.revision = revision;
+        }
+
+        public String getPath() {
+            return path;
+        }
+
+        public void setPath(String path) {
+            this.path = path;
+        }
+
+        @Override
+        public String toString() {
+            return String.format("ParsedHFUrl{endpoint='%s', repoType='%s', 
repository='%s', revision='%s', path='%s'}",
+                    endpoint, repoType, repository, revision, path);
+        }
+    }
+
+    /**
+     * Convert a HuggingFace URL to an HTTP URL
+     *
+     * @param hfUrl The hf:// URL to convert
+     * @return The corresponding HTTP URL
+     * @throws AnalysisException if the URL format is invalid
+     */
+    @VisibleForTesting
+    public static String convertHfUrlToHttpUrl(String hfUrl) throws 
AnalysisException {
+        if (Strings.isNullOrEmpty(hfUrl)) {
+            throw new AnalysisException("HuggingFace URL cannot be null or 
empty");
+        }
+
+        ParsedHFUrl parsedUrl = parseHfUrl(hfUrl);
+        return buildHttpUrl(parsedUrl);
+    }
+
+    /**
+     * Parse a HuggingFace URL into its components
+     *
+     * @param url The hf:// URL to parse
+     * @return ParsedHFUrl object containing the parsed components
+     * @throws AnalysisException if the URL format is invalid
+     */
+    @VisibleForTesting
+    public static ParsedHFUrl parseHfUrl(String url) throws AnalysisException {
+        if (Strings.isNullOrEmpty(url)) {
+            throw new AnalysisException("URL cannot be null or empty");
+        }
+
+        if (!url.startsWith(HF_SCHEME)) {
+            throw new AnalysisException("URL must start with 'hf://', got: " + 
url);
+        }
+
+        ParsedHFUrl result = new ParsedHFUrl();
+
+        // Remove the hf:// prefix
+        String remaining = url.substring(HF_SCHEME.length());
+
+        if (remaining.isEmpty()) {
+            throwParseError(url);
+        }
+
+        String[] parts = remaining.split("/", -1); // -1 to keep empty strings
+
+        if (parts.length < 4) {
+            throwParseError(url);
+        }
+
+        // Parse repository type
+        result.setRepoType(parts[0]);
+        if (!REPO_TYPE_DATASETS.equals(result.getRepoType()) && 
!REPO_TYPE_SPACES.equals(result.getRepoType())) {
+            throw new AnalysisException(
+                String.format("Currently only supports 'datasets' and 'spaces' 
repository types, got: '%s' in URL: %s",
+                    result.getRepoType(), url));
+        }
+
+        // Parse username and repository name
+        String username = parts[1];
+        String repoName = parts[2];
+
+        if (username.isEmpty() || repoName.isEmpty()) {
+            throwParseError(url);
+        }
+
+        // Check if repository name contains revision
+        int atIndex = repoName.indexOf('@');
+        if (atIndex != -1) {
+            String actualRepoName = repoName.substring(0, atIndex);
+            result.setRevision(repoName.substring(atIndex + 1));
+
+            if (actualRepoName.isEmpty() || result.getRevision().isEmpty()) {
+                throwParseError(url);
+            }
+            result.setRepository(username + "/" + actualRepoName);
+        } else {
+            result.setRepository(username + "/" + repoName);
+        }
+
+        // Build the path from remaining parts
+        StringBuilder pathBuilder = new StringBuilder();
+        for (int i = 3; i < parts.length; i++) {
+            pathBuilder.append("/").append(parts[i]);
+        }
+        String rawPath = pathBuilder.toString();
+
+        // Handle HuggingFace web interface paths like /blob/main/ or 
/tree/main/
+        // These should be converted to proper API paths
+        if (rawPath.startsWith("/blob/") || rawPath.startsWith("/tree/")) {
+            // Extract revision and actual path
+            String[] pathParts = rawPath.substring(1).split("/", 3); // Remove 
leading slash and split
+            if (pathParts.length >= 2) {
+                // pathParts[0] is "blob" or "tree" - we don't need to use it
+                String pathRevision = pathParts[1]; // revision like "main"
+                String actualPath = pathParts.length > 2 ? "/" + pathParts[2] 
: "";
+
+                // Use the revision from the path if not already specified via 
@
+                if (result.getRevision().equals(DEFAULT_REVISION)) {
+                    result.setRevision(pathRevision);
+                }
+
+                result.setPath(actualPath);
+            } else {
+                result.setPath(rawPath);
+            }
+        } else {
+            result.setPath(rawPath);
+        }
+
+        // If no path parts exist, set to empty string
+        if (result.getPath().isEmpty()) {
+            result.setPath("");
+        }
+        // Note: if path is "/" (from trailing slash), keep it as is
+
+        LOG.debug("Parsed HF URL: {} -> {}", url, result);
+        return result;
+    }
+
+    /**
+     * Build HTTP URL from parsed HF URL components
+     *
+     * @param parsedUrl The parsed HF URL components
+     * @return The HTTP URL string
+     */
+    private static String buildHttpUrl(ParsedHFUrl parsedUrl) {
+        // URL format: 
{endpoint}/{repo_type}/{repository}/resolve/{revision}{path}
+        StringBuilder httpUrl = new StringBuilder();
+
+        httpUrl.append(parsedUrl.getEndpoint());
+        if (!parsedUrl.getEndpoint().endsWith("/")) {
+            httpUrl.append("/");
+        }
+
+        httpUrl.append(parsedUrl.getRepoType()).append("/");
+        httpUrl.append(parsedUrl.getRepository()).append("/");
+        httpUrl.append("resolve").append("/");
+        httpUrl.append(parsedUrl.getRevision());
+        httpUrl.append(parsedUrl.getPath());
+
+        String result = httpUrl.toString();
+        LOG.debug("Built HTTP URL: {}", result);
+        return result;
+    }
+
+    /**
+     * Validate if a URL is a valid HuggingFace URL
+     *
+     * @param url The URL to validate
+     * @return true if it's a valid hf:// URL, false otherwise
+     */
+    @VisibleForTesting
+    public static boolean isValidHfUrl(String url) {
+        if (Strings.isNullOrEmpty(url)) {
+            return false;
+        }
+
+        try {
+            parseHfUrl(url);
+            return true;
+        } catch (AnalysisException e) {
+            LOG.debug("Invalid HF URL: {}, error: {}", url, e.getMessage());
+            return false;
+        }
+    }
+
+    /**
+     * Get the tree API URL for listing files in a repository
+     * This is useful for implementing glob patterns or directory listing
+     *
+     * @param parsedUrl The parsed HF URL components
+     * @param limit Optional limit for the number of results (0 means no limit)
+     * @return The tree API URL
+     */
+    @VisibleForTesting
+    public static String buildTreeApiUrl(ParsedHFUrl parsedUrl, int limit) {
+        return buildTreeApiUrl(parsedUrl, parsedUrl.getPath(), limit);
+    }
+
+    /**
+     * Get the tree API URL for listing files in a specific path
+     *
+     * @param parsedUrl The parsed HF URL components
+     * @param path The specific path to list
+     * @param limit Optional limit for the number of results (0 means no limit)
+     * @return The tree API URL
+     */
+    private static String buildTreeApiUrl(ParsedHFUrl parsedUrl, String path, 
int limit) {
+        // URL format: 
{endpoint}/api/{repo_type}/{repository}/tree/{revision}{path}
+        StringBuilder treeUrl = new StringBuilder();
+
+        treeUrl.append(parsedUrl.getEndpoint());
+        if (!parsedUrl.getEndpoint().endsWith("/")) {
+            treeUrl.append("/");
+        }
+
+        treeUrl.append("api").append("/");
+        treeUrl.append(parsedUrl.getRepoType()).append("/");
+        treeUrl.append(parsedUrl.getRepository()).append("/");
+        treeUrl.append("tree").append("/");
+        treeUrl.append(parsedUrl.getRevision());
+
+        // Add path if provided
+        if (!Strings.isNullOrEmpty(path) && !"/".equals(path)) {
+            if (!path.startsWith("/")) {
+                treeUrl.append("/");
+            }
+            treeUrl.append(path);
+        }
+
+        if (limit > 0) {
+            treeUrl.append("?limit=").append(limit);
+        }
+
+        String result = treeUrl.toString();
+        LOG.debug("Built tree API URL: {}", result);
+        return result;
+    }
+
+    /**
+     * Extract repository information from HF URL for display purposes
+     *
+     * @param hfUrl The hf:// URL
+     * @return A human-readable repository description
+     * @throws AnalysisException if the URL is invalid
+     */
+    @VisibleForTesting
+    public static String getRepositoryInfo(String hfUrl) throws 
AnalysisException {
+        ParsedHFUrl parsed = parseHfUrl(hfUrl);
+        return String.format("%s/%s@%s", parsed.getRepoType(), 
parsed.getRepository(), parsed.getRevision());
+    }
+
+    /**
+     * Expand a HuggingFace URL with glob patterns to matching file URLs
+     *
+     * @param hfGlobUrl The hf:// URL with glob patterns
+     * @return List of HTTP URLs that match the glob pattern
+     * @throws AnalysisException if the URL format is invalid or glob 
processing fails
+     */
+    public static List<String> expandGlob(String hfGlobUrl) throws 
AnalysisException {
+        return expandGlob(hfGlobUrl, null);
+    }
+
+    /**
+     * Expand a HuggingFace URL with glob patterns to matching file URLs
+     *
+     * @param hfGlobUrl The hf:// URL with glob patterns
+     * @param authToken Optional authentication token for private repositories
+     * @return List of HTTP URLs that match the glob pattern
+     * @throws AnalysisException if the URL format is invalid or glob 
processing fails
+     */
+    public static List<String> expandGlob(String hfGlobUrl, String authToken) 
throws AnalysisException {
+        if (Strings.isNullOrEmpty(hfGlobUrl)) {
+            throw new AnalysisException("HuggingFace glob URL cannot be null 
or empty");
+        }
+
+        // Parse the glob URL
+        ParsedHFUrl parsedUrl = parseHfUrl(hfGlobUrl);
+
+        // Check if the path contains wildcard characters
+        String path = parsedUrl.getPath();
+        if (!containsWildcards(path)) {
+            // No wildcards, return the single file
+            List<String> result = new ArrayList<>();
+            result.add(buildHttpUrl(parsedUrl));
+            return result;
+        }
+
+        // Find the longest prefix without wildcards
+        String sharedPath = getLongestPrefixWithoutWildcards(path);
+
+        // Prepare headers
+        Map<String, String> headers = new HashMap<>();
+        if (!Strings.isNullOrEmpty(authToken)) {
+            headers.put("Authorization", "Bearer " + authToken);
+        }
+
+        List<String> result = new ArrayList<>();
+
+        try {
+            // Get all files and directories to process
+            List<String> pathsToProcess = new ArrayList<>();
+            pathsToProcess.add(sharedPath);
+
+            List<String> allFilePaths = new ArrayList<>();
+
+            // Calculate the depth needed for recursion
+            // Count the number of path components in the pattern after the 
shared prefix
+            String remainingPattern = path.substring(sharedPath.length());
+            int patternDepth = splitPath(remainingPattern).size();
+
+            // If pattern contains **, we need unlimited recursion
+            boolean unlimitedRecursion = path.contains("**");
+
+            // For a pattern like /*/*.parquet (depth=2), we need to recurse 
into depth 1
+            // to list files at depth 2. So maxRecursionDepth = patternDepth - 
1
+            // But if patternDepth is 1 or less, we still need depth 0, so use 
Math.max
+            int maxRecursionDepth = unlimitedRecursion ? Integer.MAX_VALUE : 
Math.max(0, patternDepth - 1);
+
+            // Track depth for each path being processed
+            Map<String, Integer> pathDepths = new HashMap<>();
+            pathDepths.put(sharedPath, 0);
+
+            // Process directories recursively if needed
+            while (!pathsToProcess.isEmpty()) {
+                String currentPath = pathsToProcess.remove(0);
+                int currentDepth = pathDepths.getOrDefault(currentPath, 0);
+
+                // List files in current directory
+                List<String> files = new ArrayList<>();
+                List<String> directories = new ArrayList<>();
+                listHuggingFaceFiles(parsedUrl, currentPath, headers, files, 
directories);
+
+                // Add all file paths
+                allFilePaths.addAll(files);
+
+                // Add directories for recursive processing based on pattern 
depth
+                // We need to recurse if current depth is less than max 
recursion depth
+                if (currentDepth < maxRecursionDepth) {
+                    for (String dir : directories) {
+                        pathsToProcess.add(dir);
+                        pathDepths.put(dir, currentDepth + 1);
+                    }
+                }
+            }
+
+            // Filter files using glob pattern matching
+            List<String> patternComponents = splitPath(path);
+
+            for (String filePath : allFilePaths) {
+                List<String> fileComponents = splitPath(filePath);
+
+                if (matchPathComponents(fileComponents, patternComponents)) {
+                    // Build the complete HTTP URL for the matched file
+                    ParsedHFUrl fileUrl = new ParsedHFUrl();
+                    fileUrl.setEndpoint(parsedUrl.getEndpoint());
+                    fileUrl.setRepoType(parsedUrl.getRepoType());
+                    fileUrl.setRepository(parsedUrl.getRepository());
+                    fileUrl.setRevision(parsedUrl.getRevision());
+                    fileUrl.setPath(filePath);
+
+                    String httpUrl = buildHttpUrl(fileUrl);
+                    result.add(httpUrl);
+                }
+            }
+
+        } catch (Exception e) {
+            throw new AnalysisException("Failed to expand glob pattern: " + 
e.getMessage());
+        }
+
+        return result;
+    }
+
+    /**
+     * Create HTTP client with proper configuration
+     */
+    private static CloseableHttpClient createHttpClient() {
+        RequestConfig config = RequestConfig.custom()
+                .setConnectTimeout(DEFAULT_CONNECT_TIMEOUT_MS)
+                .setConnectionRequestTimeout(DEFAULT_TIMEOUT_MS)
+                .setSocketTimeout(DEFAULT_TIMEOUT_MS)
+                .build();
+
+        return HttpClientBuilder.create()
+                .setDefaultRequestConfig(config)
+                .build();
+    }
+
+    /**
+     * Execute HTTP GET request
+     */
+    private static String executeHttpGet(String url, Map<String, String> 
headers) throws IOException {
+        try (CloseableHttpClient client = createHttpClient()) {
+            HttpGet httpGet = new HttpGet(url);
+
+            // Set headers
+            if (headers != null) {
+                for (Map.Entry<String, String> entry : headers.entrySet()) {
+                    httpGet.setHeader(entry.getKey(), entry.getValue());
+                }
+            }
+
+            // Set User-Agent
+            httpGet.setHeader("User-Agent", "Doris-HFUtils/1.0");
+
+            return client.execute(httpGet, response -> {
+                int statusCode = response.getStatusLine().getStatusCode();
+                String responseBody = 
EntityUtils.toString(response.getEntity());
+
+                if (statusCode >= 400) {
+                    throw new IOException("HTTP " + statusCode + ": " + 
responseBody);
+                }
+
+                return responseBody;
+            });
+        }
+    }
+
+    /**
+     * List files from HuggingFace API with pagination support
+     */
+    private static void listHuggingFaceFiles(ParsedHFUrl parsedUrl, String 
path,
+                                           Map<String, String> headers,
+                                           List<String> files, List<String> 
directories) throws AnalysisException {
+        // Build API URL
+        String apiUrl = buildTreeApiUrl(parsedUrl, path, DEFAULT_PAGE_LIMIT);
+
+        String nextUrl = apiUrl;
+        int pageCount = 0;
+
+        while (nextUrl != null && pageCount < 100) { // Prevent infinite loops
+            try {
+                String response = executeHttpGet(nextUrl, headers);
+
+                // Parse JSON response
+                JsonArray jsonArray = 
JsonParser.parseString(response).getAsJsonArray();
+
+                for (JsonElement element : jsonArray) {
+                    JsonObject obj = element.getAsJsonObject();
+
+                    String filePath = "/" + obj.get("path").getAsString();
+                    String type = obj.get("type").getAsString();
+
+                    if ("file".equals(type)) {
+                        files.add(filePath);
+                    } else if ("directory".equals(type)) {
+                        directories.add(filePath);
+                    }
+                }
+
+                // For simplicity, we don't handle pagination in this basic 
version
+                // In a real implementation, you would parse Link headers here
+                nextUrl = null;
+                pageCount++;
+
+            } catch (Exception e) {
+                throw new AnalysisException("Failed to list files from 
HuggingFace API: " + e.getMessage());
+            }
+        }
+    }
+
+    /**
+     * Check if a path contains wildcard characters
+     *
+     * @param path The path to check
+     * @return true if the path contains wildcards, false otherwise
+     */
+    @VisibleForTesting
+    public static boolean containsWildcards(String path) {
+        if (Strings.isNullOrEmpty(path)) {
+            return false;
+        }
+        return path.contains("*") || path.contains("?") || path.contains("[") 
|| path.contains("{");
+    }
+
+    /**
+     * Get the longest prefix of a path that doesn't contain wildcards
+     *
+     * @param path The path to analyze
+     * @return The longest prefix without wildcards
+     */
+    @VisibleForTesting
+    public static String getLongestPrefixWithoutWildcards(String path) {
+        if (Strings.isNullOrEmpty(path)) {
+            return "";
+        }
+
+        int firstWildcardPos = -1;
+        for (int i = 0; i < path.length(); i++) {
+            char c = path.charAt(i);
+            if (c == '*' || c == '?' || c == '[' || c == '{') {
+                firstWildcardPos = i;
+                break;
+            }
+        }
+
+        if (firstWildcardPos == -1) {
+            return path; // No wildcards found
+        }
+
+        // Find the last slash before the first wildcard
+        String prefix = path.substring(0, firstWildcardPos);
+        int lastSlash = prefix.lastIndexOf('/');
+
+        if (lastSlash == -1) {
+            return ""; // Root path
+        }
+
+        return path.substring(0, lastSlash);
+    }
+
+    /**
+     * Match a file path against a glob pattern
+     * This is a simplified implementation based on DuckDB's Match function
+     *
+     * @param filePath The file path to match
+     * @param globPattern The glob pattern
+     * @return true if the file matches the pattern, false otherwise
+     */
+    @VisibleForTesting
+    public static boolean matchGlobPattern(String filePath, String 
globPattern) {
+        if (Strings.isNullOrEmpty(filePath) || 
Strings.isNullOrEmpty(globPattern)) {
+            return false;
+        }
+
+        try {
+            // Use Java's built-in glob pattern matching
+            PathMatcher matcher = 
FileSystems.getDefault().getPathMatcher("glob:" + globPattern);
+            return matcher.matches(Paths.get(filePath));
+        } catch (Exception e) {
+            LOG.warn("Failed to match glob pattern: {} against file: {}, 
error: {}",
+                    globPattern, filePath, e.getMessage());
+            return false;
+        }
+    }
+
+    /**
+     * Split a path into components for pattern matching
+     *
+     * @param path The path to split
+     * @return List of path components
+     */
+    @VisibleForTesting
+    public static List<String> splitPath(String path) {
+        if (Strings.isNullOrEmpty(path)) {
+            return new ArrayList<>();
+        }
+
+        List<String> components = new ArrayList<>();
+        String[] parts = path.split("/");
+        for (String part : parts) {
+            if (!part.isEmpty()) {
+                components.add(part);
+            }
+        }
+        return components;
+    }
+
+    /**
+     * Advanced pattern matching similar to DuckDB's Match function
+     * Supports ** for recursive matching
+     *
+     * @param pathComponents The path components to match
+     * @param patternComponents The pattern components
+     * @return true if the path matches the pattern, false otherwise
+     */
+    @VisibleForTesting
+    public static boolean matchPathComponents(List<String> pathComponents, 
List<String> patternComponents) {
+        return matchPathComponentsRecursive(pathComponents, 0, 
patternComponents, 0);
+    }
+
+    private static boolean matchPathComponentsRecursive(List<String> 
pathComponents, int pathIndex,
+                                                       List<String> 
patternComponents, int patternIndex) {
+        // Base cases
+        if (pathIndex >= pathComponents.size() && patternIndex >= 
patternComponents.size()) {
+            return true; // Both exhausted, match
+        }
+        if (patternIndex >= patternComponents.size()) {
+            return false; // Pattern exhausted but path remains
+        }
+        if (pathIndex >= pathComponents.size()) {
+            // Path exhausted, check if remaining pattern is all **
+            for (int i = patternIndex; i < patternComponents.size(); i++) {
+                if (!"**".equals(patternComponents.get(i))) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        String currentPattern = patternComponents.get(patternIndex);
+
+        if ("**".equals(currentPattern)) {
+            // ** matches zero or more path components
+            if (patternIndex + 1 >= patternComponents.size()) {
+                return true; // ** at end matches everything
+            }
+
+            // Try matching ** with 0, 1, 2, ... path components
+            for (int i = pathIndex; i <= pathComponents.size(); i++) {
+                if (matchPathComponentsRecursive(pathComponents, i, 
patternComponents, patternIndex + 1)) {
+                    return true;
+                }
+            }
+            return false;
+        } else {
+            // Regular pattern matching (including * and [])
+            String currentPath = pathComponents.get(pathIndex);
+            if (matchGlobPattern(currentPath, currentPattern)) {
+                return matchPathComponentsRecursive(pathComponents, pathIndex 
+ 1, patternComponents, patternIndex + 1);
+            }
+            return false;
+        }
+    }
+
+    /**
+     * Validate if a URL contains valid glob patterns
+     *
+     * @param hfUrl The hf:// URL to validate
+     * @return true if it's a valid glob URL, false otherwise
+     */
+    @VisibleForTesting
+    public static boolean isValidGlobUrl(String hfUrl) {
+        if (!isValidHfUrl(hfUrl)) {
+            return false;
+        }
+
+        try {
+            ParsedHFUrl parsed = parseHfUrl(hfUrl);
+            return containsWildcards(parsed.getPath());
+        } catch (AnalysisException e) {
+            return false;
+        }
+    }
+
+    private static void throwParseError(String url) throws AnalysisException {
+        throw new AnalysisException(
+            String.format("Failed to parse HuggingFace URL: '%s'. "
+                + "Please format URL like: 
'hf://datasets/username/dataset-name/path/to/file.parquet' "
+                + "or 
'hf://datasets/username/dataset-name@revision/path/to/file.parquet'", url));
+    }
+}
+
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
index fdde219ce25..d8311c822a6 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HttpTableValuedFunction.java
@@ -26,6 +26,7 @@ import org.apache.doris.httpv2.rest.manager.HttpUtils;
 import org.apache.doris.thrift.TBrokerFileStatus;
 import org.apache.doris.thrift.TFileType;
 
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -51,17 +52,31 @@ public class HttpTableValuedFunction extends 
ExternalFileTableValuedFunction {
             this.uri = this.httpProperties.validateAndGetUri(props);
 
             
this.backendConnectProperties.putAll(storageProperties.getBackendConfigProperties());
-
-            this.fileStatuses.clear();
-            this.fileStatuses.add(new TBrokerFileStatus(this.uri, false, 
HttpUtils.getHttpFileSize(this.uri), true));
+            generateFileStatus();
         } catch (Exception e) {
-            throw new RuntimeException(e);
+            throw new AnalysisException("Failed check http storage props, " + 
e.getMessage(), e);
         }
+    }
 
-        // TFileFormatType t = fileFormatProperties.getFileFormatType();
-        // if (!Util.isCsvFormat(t) && t != TFileFormatType.FORMAT_JSON) {
-        //     throw new AnalysisException("http() only supports format 'csv' 
and 'json'");
-        // }
+    private void generateFileStatus() throws Exception {
+        this.fileStatuses.clear();
+        if (this.uri.startsWith("http://";) || this.uri.startsWith("https://";)) 
{
+            this.fileStatuses.add(new TBrokerFileStatus(this.uri, false,
+                    HttpUtils.getHttpFileSize(this.uri, 
this.httpProperties.getHeaders()), true));
+        } else if (this.uri.startsWith("hf://")) {
+            List<String> fileUrls = HFUtils.expandGlob(this.uri);
+            if (LOG.isDebugEnabled()) {
+                for (String fileUrl : fileUrls) {
+                    LOG.debug("HttpTableValuedFunction expand hf glob uri: 
{}", fileUrl);
+                }
+            }
+            for (String fileUrl : fileUrls) {
+                this.fileStatuses.add(new TBrokerFileStatus(fileUrl, false,
+                        HttpUtils.getHttpFileSize(fileUrl, 
this.httpProperties.getHeaders()), true));
+            }
+        } else {
+            throw new AnalysisException("HttpTableValuedFunction uri is 
invalid: " + this.uri);
+        }
     }
 
     @Override
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java
new file mode 100644
index 00000000000..8d18b021af0
--- /dev/null
+++ b/fe/fe-core/src/test/java/org/apache/doris/tablefunction/HFUtilsTest.java
@@ -0,0 +1,416 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.tablefunction;
+
+import org.apache.doris.common.AnalysisException;
+import org.apache.doris.tablefunction.HFUtils.ParsedHFUrl;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+public class HFUtilsTest {
+
+    @Test
+    public void testValidHfUrlParsing() throws AnalysisException {
+        // Test basic dataset URL
+        String url1 = "hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+        ParsedHFUrl parsed1 = HFUtils.parseHfUrl(url1);
+
+        Assert.assertEquals("datasets", parsed1.getRepoType());
+        Assert.assertEquals("lhoestq/demo1", parsed1.getRepository());
+        Assert.assertEquals("main", parsed1.getRevision());
+        Assert.assertEquals("/default/train/0000.parquet", parsed1.getPath());
+        Assert.assertEquals("https://huggingface.co", parsed1.getEndpoint());
+
+        // Test URL with revision
+        String url2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+        ParsedHFUrl parsed2 = HFUtils.parseHfUrl(url2);
+
+        Assert.assertEquals("datasets", parsed2.getRepoType());
+        Assert.assertEquals("username/dataset", parsed2.getRepository());
+        Assert.assertEquals("v1.0", parsed2.getRevision());
+        Assert.assertEquals("/path/to/file.csv", parsed2.getPath());
+
+        // Test spaces URL
+        String url3 = "hf://spaces/gradio/calculator/app.py";
+        ParsedHFUrl parsed3 = HFUtils.parseHfUrl(url3);
+
+        Assert.assertEquals("spaces", parsed3.getRepoType());
+        Assert.assertEquals("gradio/calculator", parsed3.getRepository());
+        Assert.assertEquals("main", parsed3.getRevision());
+        Assert.assertEquals("/app.py", parsed3.getPath());
+
+        // Test URL with empty path
+        String url4 = "hf://datasets/user/repo/";
+        ParsedHFUrl parsed4 = HFUtils.parseHfUrl(url4);
+
+        Assert.assertEquals("datasets", parsed4.getRepoType());
+        Assert.assertEquals("user/repo", parsed4.getRepository());
+        Assert.assertEquals("main", parsed4.getRevision());
+        Assert.assertEquals("/", parsed4.getPath());
+
+        // Test URL with HuggingFace web interface format (/blob/main/)
+        String url5 = 
"hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv";
+        ParsedHFUrl parsed5 = HFUtils.parseHfUrl(url5);
+
+        Assert.assertEquals("datasets", parsed5.getRepoType());
+        Assert.assertEquals("fka/awesome-chatgpt-prompts", 
parsed5.getRepository());
+        Assert.assertEquals("main", parsed5.getRevision());
+        Assert.assertEquals("/prompts.csv", parsed5.getPath());
+
+        // Test URL with HuggingFace web interface format (/tree/v1.0/)
+        String url6 = "hf://datasets/user/dataset/tree/v1.0/data/file.txt";
+        ParsedHFUrl parsed6 = HFUtils.parseHfUrl(url6);
+
+        Assert.assertEquals("datasets", parsed6.getRepoType());
+        Assert.assertEquals("user/dataset", parsed6.getRepository());
+        Assert.assertEquals("v1.0", parsed6.getRevision());
+        Assert.assertEquals("/data/file.txt", parsed6.getPath());
+    }
+
+    @Test
+    public void testHttpUrlConversion() throws AnalysisException {
+        // Test basic conversion
+        String hfUrl1 = 
"hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+        String httpUrl1 = HFUtils.convertHfUrlToHttpUrl(hfUrl1);
+        String expected1 = 
"https://huggingface.co/datasets/lhoestq/demo1/resolve/main/default/train/0000.parquet";;
+        Assert.assertEquals(expected1, httpUrl1);
+
+        // Test conversion with revision
+        String hfUrl2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+        String httpUrl2 = HFUtils.convertHfUrlToHttpUrl(hfUrl2);
+        String expected2 = 
"https://huggingface.co/datasets/username/dataset/resolve/v1.0/path/to/file.csv";;
+        Assert.assertEquals(expected2, httpUrl2);
+
+        // Test spaces conversion
+        String hfUrl3 = "hf://spaces/gradio/calculator/app.py";
+        String httpUrl3 = HFUtils.convertHfUrlToHttpUrl(hfUrl3);
+        String expected3 = 
"https://huggingface.co/spaces/gradio/calculator/resolve/main/app.py";;
+        Assert.assertEquals(expected3, httpUrl3);
+
+        // Test HuggingFace web interface format conversion
+        String hfUrl4 = 
"hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv";
+        String httpUrl4 = HFUtils.convertHfUrlToHttpUrl(hfUrl4);
+        String expected4 = 
"https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv";;
+        Assert.assertEquals(expected4, httpUrl4);
+    }
+
+    @Test
+    public void testTreeApiUrlGeneration() throws AnalysisException {
+        String hfUrl = "hf://datasets/lhoestq/demo1/default/train";
+        ParsedHFUrl parsed = HFUtils.parseHfUrl(hfUrl);
+
+        // Test without limit
+        String treeUrl1 = HFUtils.buildTreeApiUrl(parsed, 0);
+        String expected1 = 
"https://huggingface.co/api/datasets/lhoestq/demo1/tree/main/default/train";;
+        Assert.assertEquals(expected1, treeUrl1);
+
+        // Test with limit
+        String treeUrl2 = HFUtils.buildTreeApiUrl(parsed, 100);
+        String expected2 = 
"https://huggingface.co/api/datasets/lhoestq/demo1/tree/main/default/train?limit=100";;
+        Assert.assertEquals(expected2, treeUrl2);
+    }
+
+    @Test
+    public void testRepositoryInfo() throws AnalysisException {
+        String hfUrl1 = 
"hf://datasets/lhoestq/demo1/default/train/0000.parquet";
+        String repoInfo1 = HFUtils.getRepositoryInfo(hfUrl1);
+        Assert.assertEquals("datasets/lhoestq/demo1@main", repoInfo1);
+
+        String hfUrl2 = "hf://datasets/username/[email protected]/path/to/file.csv";
+        String repoInfo2 = HFUtils.getRepositoryInfo(hfUrl2);
+        Assert.assertEquals("datasets/username/[email protected]", repoInfo2);
+    }
+
+    @Test
+    public void testValidHfUrlValidation() {
+        // Valid URLs
+        
Assert.assertTrue(HFUtils.isValidHfUrl("hf://datasets/user/repo/file.txt"));
+        
Assert.assertTrue(HFUtils.isValidHfUrl("hf://spaces/user/space/app.py"));
+        
Assert.assertTrue(HFUtils.isValidHfUrl("hf://datasets/user/[email protected]/file.txt"));
+
+        // Invalid URLs
+        Assert.assertFalse(HFUtils.isValidHfUrl(null));
+        Assert.assertFalse(HFUtils.isValidHfUrl(""));
+        Assert.assertFalse(HFUtils.isValidHfUrl("http://example.com"));
+        Assert.assertFalse(HFUtils.isValidHfUrl("hf://"));
+        Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets"));
+        Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/"));
+        Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/user"));
+        Assert.assertFalse(HFUtils.isValidHfUrl("hf://datasets/user/repo")); 
// Missing path
+        
Assert.assertFalse(HFUtils.isValidHfUrl("hf://invalid/user/repo/file.txt"));
+    }
+
+    @Test
+    public void testInvalidUrlExceptions() {
+        // Test null/empty URL
+        try {
+            HFUtils.parseHfUrl(null);
+            Assert.fail("Should throw AnalysisException for null URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        try {
+            HFUtils.parseHfUrl("");
+            Assert.fail("Should throw AnalysisException for empty URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        // Test non-hf URL
+        try {
+            HFUtils.parseHfUrl("http://example.com";);
+            Assert.fail("Should throw AnalysisException for non-hf URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("must start with 
'hf://'"));
+        }
+
+        // Test incomplete URL
+        try {
+            HFUtils.parseHfUrl("hf://datasets");
+            Assert.fail("Should throw AnalysisException for incomplete URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("Failed to parse 
HuggingFace URL"));
+        }
+
+        // Test invalid repository type
+        try {
+            HFUtils.parseHfUrl("hf://models/user/model/file.txt");
+            Assert.fail("Should throw AnalysisException for unsupported repo 
type");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("only supports 
'datasets' and 'spaces'"));
+        }
+
+        // Test empty username
+        try {
+            HFUtils.parseHfUrl("hf://datasets//repo/file.txt");
+            Assert.fail("Should throw AnalysisException for empty username");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("Failed to parse 
HuggingFace URL"));
+        }
+
+        // Test empty revision
+        try {
+            HFUtils.parseHfUrl("hf://datasets/user/repo@/file.txt");
+            Assert.fail("Should throw AnalysisException for empty revision");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("Failed to parse 
HuggingFace URL"));
+        }
+    }
+
+    @Test
+    public void testConvertHfUrlToHttpUrlExceptions() {
+        // Test null URL
+        try {
+            HFUtils.convertHfUrlToHttpUrl(null);
+            Assert.fail("Should throw AnalysisException for null URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        // Test empty URL
+        try {
+            HFUtils.convertHfUrlToHttpUrl("");
+            Assert.fail("Should throw AnalysisException for empty URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        // Test invalid URL
+        try {
+            HFUtils.convertHfUrlToHttpUrl("http://example.com";);
+            Assert.fail("Should throw AnalysisException for invalid URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("must start with 
'hf://'"));
+        }
+    }
+
+    @Test
+    public void testEdgeCases() throws AnalysisException {
+        // Test URL with special characters in path
+        String hfUrl1 = "hf://datasets/user/repo/path with 
spaces/file-name_123.parquet";
+        String httpUrl1 = HFUtils.convertHfUrlToHttpUrl(hfUrl1);
+        String expected1 = 
"https://huggingface.co/datasets/user/repo/resolve/main/path with 
spaces/file-name_123.parquet";
+        Assert.assertEquals(expected1, httpUrl1);
+
+        // Test URL with multiple slashes in path
+        String hfUrl2 = "hf://datasets/user/repo/path/to/deep/nested/file.txt";
+        String httpUrl2 = HFUtils.convertHfUrlToHttpUrl(hfUrl2);
+        String expected2 = 
"https://huggingface.co/datasets/user/repo/resolve/main/path/to/deep/nested/file.txt";;
+        Assert.assertEquals(expected2, httpUrl2);
+
+        // Test URL with revision containing special characters
+        String hfUrl3 = "hf://datasets/user/[email protected]/file.txt";
+        String httpUrl3 = HFUtils.convertHfUrlToHttpUrl(hfUrl3);
+        String expected3 = 
"https://huggingface.co/datasets/user/repo/resolve/feature-branch-v1.0/file.txt";;
+        Assert.assertEquals(expected3, httpUrl3);
+    }
+
+    @Test
+    public void testGlobFunctionality() throws AnalysisException {
+        // Test wildcard detection
+        Assert.assertTrue(HFUtils.containsWildcards("/path/*.parquet"));
+        Assert.assertTrue(HFUtils.containsWildcards("/path/**/train/*.csv"));
+        Assert.assertTrue(HFUtils.containsWildcards("/path/file_[abc].txt"));
+        Assert.assertTrue(HFUtils.containsWildcards("/path/file_{1,2,3}.txt"));
+        Assert.assertFalse(HFUtils.containsWildcards("/path/file.txt"));
+        Assert.assertFalse(HFUtils.containsWildcards(""));
+        Assert.assertFalse(HFUtils.containsWildcards(null));
+
+        // Test longest prefix extraction
+        Assert.assertEquals("/path", 
HFUtils.getLongestPrefixWithoutWildcards("/path/*.parquet"));
+        Assert.assertEquals("/path", 
HFUtils.getLongestPrefixWithoutWildcards("/path/**/train/*.csv"));
+        Assert.assertEquals("/path", 
HFUtils.getLongestPrefixWithoutWildcards("/path/file_[abc].txt"));
+        Assert.assertEquals("/path/to/deep", 
HFUtils.getLongestPrefixWithoutWildcards("/path/to/deep/*.txt"));
+        Assert.assertEquals("/path/file.txt", 
HFUtils.getLongestPrefixWithoutWildcards("/path/file.txt"));
+        Assert.assertEquals("", 
HFUtils.getLongestPrefixWithoutWildcards("*.txt"));
+
+        // Test glob URL validation
+        
Assert.assertTrue(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/*.parquet"));
+        
Assert.assertTrue(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/**/train/*.csv"));
+        
Assert.assertFalse(HFUtils.isValidGlobUrl("hf://datasets/user/repo/path/file.txt"));
+        Assert.assertFalse(HFUtils.isValidGlobUrl("http://example.com/*.txt"));
+        Assert.assertFalse(HFUtils.isValidGlobUrl(null));
+    }
+
+    @Test
+    public void testGlobPatternMatching() {
+        // Test basic pattern matching
+        Assert.assertTrue(HFUtils.matchGlobPattern("file.txt", "*.txt"));
+        Assert.assertTrue(HFUtils.matchGlobPattern("file.parquet", 
"*.parquet"));
+        Assert.assertTrue(HFUtils.matchGlobPattern("file_a.txt", 
"file_[abc].txt"));
+        Assert.assertFalse(HFUtils.matchGlobPattern("file_d.txt", 
"file_[abc].txt"));
+        Assert.assertFalse(HFUtils.matchGlobPattern("file.csv", "*.txt"));
+
+        // Test edge cases
+        Assert.assertFalse(HFUtils.matchGlobPattern(null, "*.txt"));
+        Assert.assertFalse(HFUtils.matchGlobPattern("file.txt", null));
+        Assert.assertFalse(HFUtils.matchGlobPattern("", "*.txt"));
+    }
+
+    @Test
+    public void testPathSplitting() {
+        List<String> components1 = HFUtils.splitPath("/path/to/file.txt");
+        Assert.assertEquals(3, components1.size());
+        Assert.assertEquals("path", components1.get(0));
+        Assert.assertEquals("to", components1.get(1));
+        Assert.assertEquals("file.txt", components1.get(2));
+
+        List<String> components2 = HFUtils.splitPath("path/to/file.txt");
+        Assert.assertEquals(3, components2.size());
+        Assert.assertEquals("path", components2.get(0));
+
+        List<String> components3 = HFUtils.splitPath("");
+        Assert.assertEquals(0, components3.size());
+
+        List<String> components4 = HFUtils.splitPath(null);
+        Assert.assertEquals(0, components4.size());
+    }
+
+    @Test
+    public void testAdvancedPatternMatching() {
+        // Test ** recursive matching
+        List<String> pathComponents1 = 
HFUtils.splitPath("path/to/deep/file.txt");
+        List<String> patternComponents1 = 
HFUtils.splitPath("path/**/file.txt");
+        Assert.assertTrue(HFUtils.matchPathComponents(pathComponents1, 
patternComponents1));
+
+        List<String> pathComponents2 = HFUtils.splitPath("path/file.txt");
+        List<String> patternComponents2 = 
HFUtils.splitPath("path/**/file.txt");
+        Assert.assertTrue(HFUtils.matchPathComponents(pathComponents2, 
patternComponents2));
+
+        List<String> pathComponents3 = HFUtils.splitPath("different/file.txt");
+        List<String> patternComponents3 = 
HFUtils.splitPath("path/**/file.txt");
+        Assert.assertFalse(HFUtils.matchPathComponents(pathComponents3, 
patternComponents3));
+
+        // Test single * matching
+        List<String> pathComponents4 = 
HFUtils.splitPath("path/train/file.txt");
+        List<String> patternComponents4 = HFUtils.splitPath("path/*/file.txt");
+        Assert.assertTrue(HFUtils.matchPathComponents(pathComponents4, 
patternComponents4));
+
+        List<String> pathComponents5 = 
HFUtils.splitPath("path/to/deep/file.txt");
+        List<String> patternComponents5 = HFUtils.splitPath("path/*/file.txt");
+        Assert.assertFalse(HFUtils.matchPathComponents(pathComponents5, 
patternComponents5));
+    }
+
+    @Test
+    public void testGlobExpansion() throws AnalysisException {
+        // Test non-glob URL (should return single result)
+        String nonGlobUrl = "hf://datasets/user/repo/path/file.txt";
+        List<String> result1 = HFUtils.expandGlob(nonGlobUrl);
+        Assert.assertEquals(1, result1.size());
+        
Assert.assertEquals("https://huggingface.co/datasets/user/repo/resolve/main/path/file.txt";,
 result1.get(0));
+
+        // Test glob URL validation
+        String globUrl1 = "hf://datasets/user/repo/path/*.parquet";
+        Assert.assertTrue(HFUtils.isValidGlobUrl(globUrl1));
+
+        String globUrl2 = "hf://datasets/user/repo/path/*.csv";
+        Assert.assertTrue(HFUtils.isValidGlobUrl(globUrl2));
+
+        // Note: Real glob expansion tests would require actual HuggingFace 
API calls
+        // The actual expansion will fail without real API access, but URL 
parsing works
+    }
+
+    @Test
+    public void testGlobExpansionExceptions() throws AnalysisException {
+        // Test null URL
+        try {
+            HFUtils.expandGlob(null);
+            Assert.fail("Should throw AnalysisException for null URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        // Test empty URL
+        try {
+            HFUtils.expandGlob("");
+            Assert.fail("Should throw AnalysisException for empty URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("cannot be null or 
empty"));
+        }
+
+        // Test invalid URL
+        try {
+            HFUtils.expandGlob("http://example.com/*.txt";);
+            Assert.fail("Should throw AnalysisException for invalid URL");
+        } catch (AnalysisException e) {
+            Assert.assertTrue(e.getMessage().contains("must start with 
'hf://'"));
+        }
+
+        List<String> res = 
HFUtils.expandGlob("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv");
+        Assert.assertEquals(1, res.size());
+        
Assert.assertEquals("https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv";,
+                res.get(0));
+
+        ParsedHFUrl parsed = 
HFUtils.parseHfUrl("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv");
+        Assert.assertEquals("/prompts.csv", parsed.getPath());
+
+        res = 
HFUtils.expandGlob("hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*");
+        Assert.assertEquals(3, res.size());
+        Assert.assertTrue(res.contains(
+                
"https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv";));
+        Assert.assertTrue(res.contains(
+                
"https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/.gitattributes";));
+        Assert.assertTrue(res.contains(
+                
"https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/README.md";));
+    }
+}
+
diff --git a/regression-test/data/external_table_p0/tvf/test_http_tvf.out 
b/regression-test/data/external_table_p0/tvf/test_http_tvf.out
index c8b59abe597..891ce36d02e 100644
--- a/regression-test/data/external_table_p0/tvf/test_http_tvf.out
+++ b/regression-test/data/external_table_p0/tvf/test_http_tvf.out
@@ -63,26 +63,7 @@ c13  text    Yes     false   \N      NONE
 2500
 
 -- !sql06 --
-20     2023-08-17      0       -5      18158   784479801       
1485484354598941738     -6632681928222776815    9708.4307       
-330432620.706069       \\N     \\N     2022-09-15 21:40:55     2023-02-23      
2023-08-13 21:31:54     O       X       
2pYmX2vAhfEEHZZYPsgAmda1G7otnwx5TmUC879FPhDeIjvWI79ksBZpfFG2gp7jhCSbpZiecKGklB5SvG8tm31i5SUqe1xrWgLt4HSq7lMJWp75tx2kxD7pRIOpn
   \\N
-21     2023-08-18      0       63      -27847  -35409596       
8638201997392767650     4919963231735304178     -23382.541      
-1803403621.4263129     -22009767       \\N     2023-03-31 10:56:14     
2023-01-20      2023-02-18 13:37:52     N       T       
PSiFwUEx3eVFNtjlnQ70YkgZNvKrGmQ2DN5K9yYHiSdFWeEDB1UpL3Frt8z1kEAIWRDWqXZuyi      
\\N
-31     2023-08-27      0       17      -18849  1728109133      
3266501886640700374     527195452623418935      -24062.328      
-1514348021.262435      \\N     \\N     2022-10-07 03:24:23     2022-09-25      
\\N     0       8       
yKMiAntORoRa8svnMfcxlOPwwND1m5s2fdS26Xu6cfs6HK5SAibqIp9h8sZcpjHy4       \\N
-41     2023-08-27      1       -104    22750   \\N     8527773271030840740     
5554497317268279215     -5296.8281      -1715646888.01304       \\N     \\N     
2022-12-02 17:56:44     2022-10-12      2023-02-19 07:02:54     V               
E9GzQdTwX1ITUQz27IVznAs6Ca4WwprKk6Odjs6SH75D2F1089QiY3HQ52LXRD1V6xAWjhLE2hWgW3EdHuAOnUDVrb5V
    \\N
-49     2023-08-08      0       \\N     16275   -2144851675     
-2303421957908954634    -46526938720058765      -13141.143      -686632233.2302 
\\N     \\N     2022-09-01 00:16:01     2023-03-25      2022-09-07 14:59:03     
s               yvuILR2iNxfe8RRml       \\N
-50     2023-08-24      1       15      14403   \\N     -6418906115745394180    
9205303779366462513     -4331.5488      -615112179.557648       \\N     \\N     
2022-12-29 02:27:20     2023-06-01      2023-08-12 04:50:04     a               
eCl38sztIvBQvGvGKyYZmyMXy9vIJx197iu3JwP9doJGcrYUl9Uova0rz4iCCgrjlAiZU18Fs9YtCq830nhM
    \\N
-50     2023-08-06      1       109     -6330   1479023892      
-8630800697573159428    -1645095773540208759    17880.961       
-1453844792.0139489     \\N     \\N     2022-09-22 02:03:21     2023-05-14      
2023-03-25 02:18:34     m               
JKnIgXvGVidGiWl9YRSi3mFI7wHKt1sBpWSadKF8VX3LAuElm4sdc9gtxREaUr57oikSYlU8We8h1MWqQlYNiJObl
       \\N
-57     2023-08-19      1       2       -25462  -74112029       
6458082754318544493     -7910671781690629051    -15205.859      
-306870797.484914       \\N     \\N     2023-07-10 18:39:10     2023-02-12      
2023-01-27 07:26:06     y               Xi9nDVrLv8m6AwEpUxmtzFAuK48sQ   \\N
-58     2023-08-22      \\N     0       -18231  1832867360      
6997858407575297145     2480714305422728023     -5450.4888      
1475901032.138386       \\N     \\N     2023-02-02 05:13:24     2022-09-18      
2023-04-23 10:51:15     k               
LdFXF7Kmfzgmnn2R6zLsXdmi3A2cLBLq4G4WDVNDhxvH7dYH8Kga2WA47uSIxp6NSrwPSdw0ssB1TS8RFJTDJAB0Uba3e05NL2Aiw0ja
        \\N
-60     2023-08-27      0       -52     -2338   -757056972      
1047567408607120856     6541476642780646552     6614.0894       
-1204448798.5178549     \\N     \\N     2022-12-29 14:47:30     2022-09-24      
2023-08-01 12:41:59     O       F       
RM4F1Ke7lkcnuxF2nK0j9VBW3MDcgyHR4pseBjtFnqS6GUkVFuzF6u3Cp9Nv7ab0O6UYrpP4DhU     
\\N
-62     2023-08-21      0       81      20302   -200761532      
6365479976421007608     \\N     -29916.533      1709141750.8284781      \\N     
\\N     2023-05-04 01:14:51     2022-09-17      2022-12-04 19:30:09     d       
v       
BKWy9dTNg1aZW7ancEJAmEDOPK5TwFsNSHbI78emu9gymeIlx5NoLmyii0QAqdzRvSQPZKiqKkwInGCTIBnK1yYkK7zD
    \\N
-65     2023-08-09      0       94      31514   814994517       
-297697460695940343     734910652450318597      -13061.892      62750847.041706 
-9808654        \\N     2023-08-14 22:01:27     2023-05-19      2022-11-13 
13:44:28     V               
aGeMsI24O12chGlP5ak0AHghAz7bu5MargJBStHnt0yMnChH0JnfYhsfH1u59XIHkJKMsHYktBqORkGlovu8V47E74KeFpaqxn5yLyXfDbhhzUKf
        \\N
-66     2023-08-15      1       -91     28378   609923317       
4872185586197131212     1207709464099378591     \\N     -1863683325.9851229     
\\N     \\N     2022-09-24 10:39:23     2022-09-24      2022-10-16 18:36:43     
Y       z       
AI1BSPQdKiHJiQH1kguyLSWsDXkC7zwy7PwgWnyGSaa9tBKRex8vHBdxg2QSKZKL2mV2lHz7iI1PnsTd4MXDcIKhqiHyPuQPt2tEtgt0UgF6
    \\N
-68     2023-08-23      1       -73     20117   1737338128      
795638676048937749      -5551546237562433901    -30627.039      68589475.684545 
\\N     \\N     2022-12-28 20:26:51     2022-10-04      2023-07-30 00:20:06     
y               
keZ3JlWWpdnPBejf0cuiCQCVBBTd5gjvO08NVdcAFewqL7nRT4N9lnvSU6pWmletA5VbPQCeQapJdcnQCHfZUDCf4ulCnczyqr7SGrbGRT0XYcd7iktKM
   \\N
-8      2023-08-14      1       109     -31573  -1362465190     
3990845741226497177     2732763251146840270     -25698.553      
1312831962.5678179      \\N     \\N     2023-03-07 14:13:19     2022-10-18      
2023-07-16 05:03:13     D               
PBn1wa6X8WneZYLMac11zzyhGl7tPXB5XgjmOV8L6uav9ja5oY433ktb2yhyQQIqBveZPkme        
\\N
-80     2023-08-18      0       -18     -8971   679027874       
6535956962935330265     3960889045799757165     -13219.76       
1187161924.505394       \\N     \\N     2023-03-11 07:40:00     2022-11-29      
2023-01-14 07:24:07     N       D       
3Nhx6xX1qdwaq7lxwLRSKMtJFbC03swWv12mpySSVysH3igGZTiGPuKMsYW7HAkf6CWc7c0nzqDsjuH3FYVMNCWRmfxMrmY8rykQCC4Ve
       \\N
-81     2023-08-23      0       106     11492   -667795397      
4480250461471356146     -5346660566234294101    9082.75 385167225.902608        
\\N     \\N     2023-03-20 03:33:16     2022-11-24      2023-02-16 18:29:41     
G       9       Lk3eNVQNjucbekD1rZmUlGPiXS5JvcWr2LQzRU8GSGIbSag \\N
-85     2023-08-11      1       -7      24304   -2043877415     
-2024144417867729183    \\N     5363.0244       -578615669.042831       \\N     
\\N     2023-07-15 01:07:41     2023-08-13      2023-01-20 11:57:48     i       
        WQ9dh9ajPu0y    \\N
-90     2023-08-27      1       22      16456   -1476824962     
-3279894870153540825    8990195191470116763     26651.906       
206860148.942546        \\N     \\N     2022-10-07 03:11:03     2023-03-18      
2023-04-15 00:38:33     T       L       
QW0GQ3GoMtHgxPQOWGfVaveynahNpsNs09siMFA1OtO6QEDBQTdivmGyq7bFzejAqwbbVQQpREAmeLjcFSXLnQuou2KbwYD
 \\N
-91     2023-08-27      1       90      2465    702240964       
6373830997821598984     305860046137409400      15991.356       
1599972327.386147       \\N     \\N     2023-04-26 19:31:10     2023-07-21      
\\N     2               
B7YKYBYT8w0YC926bZ8Yz1VzyiWw2NWDAiTlEoPVyz9AXGti2Npg1FxWqWk4hEaALw0ZBSuiAIPj41lq36g5QRpPmAjNPK
  \\N
+20
 
 -- !sql07 --
 k00    text    Yes     false   \N      NONE
@@ -169,3 +150,27 @@ m  map<text,text>  Yes     false   \N      NONE
 4      \N      \N      123.123456789876        12      \N      123
 5      1.1234  12.123456       \N      \N      1234.123456789  \N
 
+-- !sql15 --
+204
+
+-- !sql16 --
+204
+
+-- !sql17 --
+c1     text    Yes     false   \N      NONE
+
+-- !sql18 --
+{"description": "Large Movie Review Dataset.\\nThis is a dataset for binary 
sentiment classification containing substantially more data than previous 
benchmark datasets. We provide a set of 25,000 highly polar movie reviews for 
training, and 25,000 for testing. There is additional unlabeled data for use as 
well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\\n  author    = 
{Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  
and  Ng, Andrew Y.  and   [...]
+
+-- !sql19 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this 
film had an ending TO spoil... I only started watching it in the middle, after 
Matt had gotten into Sarah's body, but then I became fascinated by the 
bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly 
see how Matt wld end up happy. What about his fiancee? At one stage looked like 
he was gonna get with his best friend, surely icky and wrong... and then the 
whole 'oggi oggi oggi' thing  [...]
+
+-- !sql20 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this 
film had an ending TO spoil... I only started watching it in the middle, after 
Matt had gotten into Sarah's body, but then I became fascinated by the 
bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly 
see how Matt wld end up happy. What about his fiancee? At one stage looked like 
he was gonna get with his best friend, surely icky and wrong... and then the 
whole 'oggi oggi oggi' thing  [...]
+
+-- !sql21 --
+A Turkish Bath sequence in a film noir located in New York in the 50's, 
that must be a hint at something ! Something that curiously, in all the 
previous comments, no one has pointed out , but seems to me essential to the 
understanding of this movie <br /><br />the Turkish Baths sequence: a back 
street at night, the entrance of a sleazy sauna, and Scalise wrapped in a 
sheet, getting his thighs massaged. Steve, the masseur is of the young rough 
boxer ( Beefcake!) type , and another guy [...]
+
+-- !sql21 --
+!!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this 
film had an ending TO spoil... I only started watching it in the middle, after 
Matt had gotten into Sarah's body, but then I became fascinated by the 
bizarreness of the plot, even for a Channel 5 movie... and couldn't possibly 
see how Matt wld end up happy. What about his fiancee? At one stage looked like 
he was gonna get with his best friend, surely icky and wrong... and then the 
whole 'oggi oggi oggi' thing  [...]
+
diff --git a/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy 
b/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy
index 3cbd35e51a7..133b992e8c4 100644
--- a/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_http_tvf.groovy
@@ -86,12 +86,12 @@ suite("test_http_tvf", "p2") {
 
     // json
     qt_sql06 """
-        select * from
+        select count(*) from
         http(
             "uri" = 
"https://raw.githubusercontent.com/apache/doris/refs/heads/master/regression-test/data/load_p0/stream_load/basic_data.json";,
             "format" = "json",
             "strip_outer_array" = true
-        ) order by k00;
+        );
     """
 
     qt_sql07 """
@@ -177,4 +177,72 @@ suite("test_http_tvf", "p2") {
             "http.max.request.size.bytes" = "2000"
         ) order by id;
     """
+
+    // hf
+    qt_sql15 """
+        select count(*) from
+        http(
+            "uri" = 
"hf://datasets/fka/awesome-chatgpt-prompts/blob/main/prompts.csv",
+            "format" = "csv"
+        );
+    """
+
+    qt_sql16 """
+        select count(*) from
+        http(
+            "uri" = 
"hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*.csv",
+            "format" = "csv"
+        );
+    """
+
+    qt_sql17 """
+        desc function
+        http(
+            "uri" = 
"hf://datasets/fka/awesome-chatgpt-prompts/blob/main/*.csv",
+            "format" = "csv"
+        );
+    """
+
+    // branch
+    qt_sql18 """
+        select * from
+        http(
+            "uri" = "hf://datasets/stanfordnlp/imdb@script/dataset_infos.json",
+            "format" = "json"
+        );
+    """
+
+    qt_sql19 """
+        select * from
+        http(
+            "uri" = 
"hf://datasets/stanfordnlp/imdb@main/plain_text/test-00000-of-00001.parquet",
+            "format" = "parquet"
+        ) order by text limit 1;
+    """
+
+    // wildcard
+    qt_sql20 """
+        select * from
+        http(
+            "uri" = 
"hf://datasets/stanfordnlp/imdb@main/*/test-00000-of-00001.parquet",
+            "format" = "parquet"
+        ) order by text limit 1;
+    """
+
+    qt_sql21 """
+        select * from
+        http(
+            "uri" = "hf://datasets/stanfordnlp/imdb@main/*/*.parquet",
+            "format" = "parquet"
+        ) order by text limit 1;
+    """
+
+    qt_sql22 """
+        select * from
+        http(
+            "uri" = 
"hf://datasets/stanfordnlp/imdb@main/**/test-00000-of-0000[1].parquet",
+            "format" = "parquet"
+        ) order by text limit 1;
+    """
 }
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to