This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a7e09412a2 fix(spark): return input string for PATH/FILE on schemeless URLs in `parse_url` (#20506)
a7e09412a2 is described below

commit a7e09412a2b7e688e87d87b580884a8dc938234f
Author: David López <[email protected]>
AuthorDate: Mon Mar 16 21:46:50 2026 +0100

    fix(spark): return input string for PATH/FILE on schemeless URLs in `parse_url` (#20506)
    
    ## Which issue does this PR close?
    
    - NA
    
    ## Rationale for this change
    
    Spark's `java.net.URI` treats schemeless strings (e.g. `'notaurl'`) as
    relative URIs where the entire input becomes the path component. The
    Rust `url` crate rejects these with `RelativeUrlWithoutBase`, and the
    current implementation maps all such errors to `NULL` — but Spark
    returns the input string for `PATH` and `FILE`.
    
    ## What changes are included in this PR?
    
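    - In `parse_url.rs`, when catching `RelativeUrlWithoutBase` for
    schemeless URLs, split the input into path, query, and fragment
    instead of returning `NULL`: `PATH` returns the path, `FILE` returns
    the path plus query (for a bare string such as `'notaurl'` both are
    the input string), `QUERY` and `REF` return the query and fragment
    when present, and the remaining parts still return `NULL`
    - `process_parse_url` takes a new `has_key_arg` flag so that an
    explicitly `NULL` third argument yields `NULL`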
    - Updated unit tests and sqllogictests for both `parse_url` and
    `try_parse_url`
    
    ## Are these changes tested?
    
    Yes:
    - Unit test `test_parse_schemeless_url` covers all 8 URL parts against a
    schemeless input
    - sqllogictest coverage in `parse_url.slt` and `try_parse_url.slt`
    
    ## Are there any user-facing changes?
    
    Yes — `parse_url('notaurl', 'PATH')` and `parse_url('notaurl', 'FILE')`
    now return `'notaurl'` instead of `NULL`, matching Spark behavior.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/spark/src/function/url/parse_url.rs     | 126 ++++++++++++++++++-
 .../test_files/spark/url/parse_url.slt             | 133 +++++++++++++++++++++
 .../test_files/spark/url/try_parse_url.slt         | 133 +++++++++++++++++++++
 3 files changed, 387 insertions(+), 5 deletions(-)

diff --git a/datafusion/spark/src/function/url/parse_url.rs b/datafusion/spark/src/function/url/parse_url.rs
index 7beb02f775..50591fb25e 100644
--- a/datafusion/spark/src/function/url/parse_url.rs
+++ b/datafusion/spark/src/function/url/parse_url.rs
@@ -84,7 +84,35 @@ impl ParseUrl {
         let url: std::result::Result<Url, ParseError> = Url::parse(value);
         if let Err(ParseError::RelativeUrlWithoutBase) = url {
             return if !value.contains("://") {
-                Ok(None)
+                // Schemeless URLs are treated as relative URIs (like java.net.URI).
+                // Manually parse path, query, and fragment components.
+                let (without_fragment, fragment) = match value.split_once('#') 
{
+                    Some((before, frag)) => (before, Some(frag)),
+                    None => (value, None),
+                };
+                let (path, query) = match without_fragment.split_once('?') {
+                    Some((p, q)) => (p, Some(q)),
+                    None => (without_fragment, None),
+                };
+                Ok(match part {
+                    "PATH" => Some(path.to_string()),
+                    "QUERY" => match key {
+                        None => query.map(String::from),
+                        Some(key) => query.and_then(|q| {
+                            q.split('&')
+                                .filter_map(|pair| pair.split_once('='))
+                                .find(|(k, _)| *k == key)
+                                .map(|(_, v)| v.to_string())
+                        }),
+                    },
+                    "REF" => fragment.map(String::from),
+                    "FILE" => {
+                        // FILE = path + query (without fragment)
+                        Some(without_fragment.to_string())
+                    }
+                    // HOST, PROTOCOL, AUTHORITY, USERINFO → NULL
+                    _ => None,
+                })
             } else {
                 Err(exec_datafusion_err!(
                     "The url is invalid: {value}. Use `try_parse_url` to 
tolerate invalid URL and return NULL instead. SQLSTATE: 22P02"
@@ -199,6 +227,7 @@ pub fn spark_handled_parse_url(
                     as_string_array(part)?,
                     as_string_array(key)?,
                     handler_err,
+                    true,
                 )
             }
             (DataType::Utf8View, DataType::Utf8View, DataType::Utf8View) => {
@@ -207,6 +236,7 @@ pub fn spark_handled_parse_url(
                     as_string_view_array(part)?,
                     as_string_view_array(key)?,
                     handler_err,
+                    true,
                 )
             }
             (DataType::LargeUtf8, DataType::LargeUtf8, DataType::LargeUtf8) => 
{
@@ -215,6 +245,7 @@ pub fn spark_handled_parse_url(
                     as_large_string_array(part)?,
                     as_large_string_array(key)?,
                     handler_err,
+                    true,
                 )
             }
             _ => exec_err!(
@@ -240,6 +271,7 @@ pub fn spark_handled_parse_url(
                     as_string_array(part)?,
                     &key,
                     handler_err,
+                    false,
                 )
             }
             (DataType::Utf8View, DataType::Utf8View) => {
@@ -248,6 +280,7 @@ pub fn spark_handled_parse_url(
                     as_string_view_array(part)?,
                     &key,
                     handler_err,
+                    false,
                 )
             }
             (DataType::LargeUtf8, DataType::LargeUtf8) => {
@@ -256,6 +289,7 @@ pub fn spark_handled_parse_url(
                     as_large_string_array(part)?,
                     &key,
                     handler_err,
+                    false,
                 )
             }
             _ => exec_err!(
@@ -272,6 +306,7 @@ fn process_parse_url<'a, A, B, C, T>(
     part_array: &'a B,
     key_array: &'a C,
     handle: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+    has_key_arg: bool,
 ) -> Result<ArrayRef>
 where
     &'a A: StringArrayType<'a>,
@@ -284,7 +319,11 @@ where
         .zip(part_array.iter())
         .zip(key_array.iter())
         .map(|((url, part), key)| {
-            if let (Some(url), Some(part), key) = (url, part, key) {
+            // Spark returns NULL when the third argument is explicitly NULL
+            if has_key_arg && key.is_none() {
+                return Ok(None);
+            }
+            if let (Some(url), Some(part)) = (url, part) {
                 handle(ParseUrl::parse(url, part, key))
             } else {
                 Ok(None)
@@ -357,9 +396,86 @@ mod tests {
     }
 
     #[test]
-    fn test_parse_malformed_url_returns_error() -> Result<()> {
-        let got = ParseUrl::parse("notaurl", "HOST", None)?;
-        assert_eq!(got, None);
+    fn test_parse_schemeless_url() -> Result<()> {
+        // Spark's java.net.URI treats schemeless strings as relative URIs.
+        // Simple schemeless string: no query, no fragment.
+        assert_eq!(
+            ParseUrl::parse("notaurl", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl", "FILE", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "AUTHORITY", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "USERINFO", None)?, None);
+
+        // Schemeless URL with query string
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "FILE", None)?,
+            Some("notaurl?key=value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", None)?,
+            Some("key=value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", Some("key"))?,
+            Some("value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", Some("missing"))?,
+            None
+        );
+        assert_eq!(ParseUrl::parse("notaurl?key=value", "HOST", None)?, None);
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "PROTOCOL", None)?,
+            None
+        );
+
+        // Schemeless URL with fragment
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "REF", None)?,
+            Some("reference".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "FILE", None)?,
+            Some("notaurl".to_string())
+        );
+
+        // Schemeless URL with both query and fragment
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", None)?,
+            Some("a=1&b=2".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", Some("b"))?,
+            Some("2".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "REF", None)?,
+            Some("frag".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "FILE", None)?,
+            Some("notaurl?a=1&b=2".to_string())
+        );
         Ok(())
     }
 
diff --git a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
index f2dc55f755..7a5051d50e 100644
--- a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
@@ -140,6 +140,96 @@ SELECT parse_url('notaurl', 'host');
 ----
 NULL
 
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
 query T
 SELECT parse_url('https://example.com', 'PATH');
 ----
@@ -175,3 +265,46 @@ SELECT parse_url();
 
 query error DataFusion error: Execution error: The url is invalid: inva 
lid://spark\.apache\.org/path\?query=1\. Use `try_parse_url` to tolerate 
invalid URL and return NULL instead\. SQLSTATE: 22P02
 SELECT parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT parse_url(NULL, NULL, NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
index 403747c63c..a0e42a1648 100644
--- a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
@@ -91,6 +91,96 @@ SELECT try_parse_url('notaurl', 'host');
 ----
 NULL
 
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT try_parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT try_parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT try_parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT try_parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT try_parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
 query T
 SELECT try_parse_url('https://example.com', 'PATH');
 ----
@@ -125,3 +215,46 @@ query T
 SELECT try_parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
 ----
 NULL
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT try_parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT try_parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT try_parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT try_parse_url(NULL, NULL, NULL);
+----
+NULL


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to