martin-g commented on code in PR #20506:
URL: https://github.com/apache/datafusion/pull/20506#discussion_r2846984418
##########
datafusion/spark/src/function/url/parse_url.rs:
##########
@@ -357,9 +360,23 @@ mod tests {
}
#[test]
- fn test_parse_malformed_url_returns_error() -> Result<()> {
- let got = ParseUrl::parse("notaurl", "HOST", None)?;
- assert_eq!(got, None);
+ fn test_parse_schemeless_url() -> Result<()> {
+ // Spark's java.net.URI treats schemeless strings as relative URIs
+ // where the entire input becomes the path component
+ assert_eq!(
+ ParseUrl::parse("notaurl", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl", "FILE", None)?,
+ Some("notaurl".to_string())
+ );
Review Comment:
```suggestion
assert_eq!(
ParseUrl::parse("notaurl", "FILE", None)?,
Some("notaurl".to_string())
);
assert_eq!(
ParseUrl::parse("notaurl?key=value", "FILE", None)?,
Some("notaurl?key=value".to_string())
);
```
Spark returns the whole input for FILE:
```
spark-sql (default)> select parse_url('notaurl?key=value', 'FILE');
notaurl?key=value
Time taken: 0.015 seconds, Fetched 1 row(s)
```
##########
datafusion/spark/src/function/url/parse_url.rs:
##########
@@ -357,9 +360,23 @@ mod tests {
}
#[test]
- fn test_parse_malformed_url_returns_error() -> Result<()> {
- let got = ParseUrl::parse("notaurl", "HOST", None)?;
- assert_eq!(got, None);
+ fn test_parse_schemeless_url() -> Result<()> {
+ // Spark's java.net.URI treats schemeless strings as relative URIs
+ // where the entire input becomes the path component
+ assert_eq!(
+ ParseUrl::parse("notaurl", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl", "FILE", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
Review Comment:
```suggestion
assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
assert_eq!(ParseUrl::parse("notaurl?key=value", "QUERY", None)?,
Some("key=value".to_string()));
```
Spark:
```
spark-sql (default)> select parse_url('notaurl?key=value', 'QUERY');
key=value
Time taken: 0.027 seconds, Fetched 1 row(s)
```
##########
datafusion/spark/src/function/url/parse_url.rs:
##########
@@ -357,9 +360,23 @@ mod tests {
}
#[test]
- fn test_parse_malformed_url_returns_error() -> Result<()> {
- let got = ParseUrl::parse("notaurl", "HOST", None)?;
- assert_eq!(got, None);
+ fn test_parse_schemeless_url() -> Result<()> {
+ // Spark's java.net.URI treats schemeless strings as relative URIs
+ // where the entire input becomes the path component
+ assert_eq!(
+ ParseUrl::parse("notaurl", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
Review Comment:
```suggestion
assert_eq!(
ParseUrl::parse("notaurl", "PATH", None)?,
Some("notaurl".to_string())
);
assert_eq!(
ParseUrl::parse("notaurl?key=value", "PATH", None)?,
Some("notaurl".to_string())
);
```
Spark returns just the PATH part, without the Query String:
```
spark-sql (default)> select parse_url('notaurl?key=value', 'PATH');
notaurl
Time taken: 0.03 seconds, Fetched 1 row(s)
```
##########
datafusion/sqllogictest/test_files/spark/url/parse_url.slt:
##########
@@ -175,3 +196,46 @@ SELECT parse_url();
query error DataFusion error: Execution error: The url is invalid: inva
lid://spark\.apache\.org/path\?query=1\. Use `try_parse_url` to tolerate
invalid URL and return NULL instead\. SQLSTATE: 22P02
SELECT parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part)
+query T
+SELECT parse_url('https://example.com/path?query=1', 'QUERY', NULL);
Review Comment:
Spark behaves differently here too:
If the third param is NULL then it returns NULL:
```
spark-sql (default)> SELECT parse_url('https://example.com/path?query=1',
'QUERY', NULL);
NULL
Time taken: 0.014 seconds, Fetched 1 row(s)
```
Without the third parameter it returns the query string:
```
spark-sql (default)> SELECT parse_url('https://example.com/path?query=1',
'QUERY');
query=1
Time taken: 0.018 seconds, Fetched 1 row(s)
```
##########
datafusion/sqllogictest/test_files/spark/url/parse_url.slt:
##########
@@ -140,6 +140,27 @@ SELECT parse_url('notaurl', 'host');
----
NULL
+# Schemeless URLs: PATH and FILE return the input string (Spark java.net.URI
behavior)
Review Comment:
Please see my comments for the unit tests and apply them here too.
##########
datafusion/spark/src/function/url/parse_url.rs:
##########
@@ -357,9 +360,23 @@ mod tests {
}
#[test]
- fn test_parse_malformed_url_returns_error() -> Result<()> {
- let got = ParseUrl::parse("notaurl", "HOST", None)?;
- assert_eq!(got, None);
+ fn test_parse_schemeless_url() -> Result<()> {
+ // Spark's java.net.URI treats schemeless strings as relative URIs
+ // where the entire input becomes the path component
+ assert_eq!(
+ ParseUrl::parse("notaurl", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl", "FILE", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
Review Comment:
```suggestion
assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
assert_eq!(ParseUrl::parse("notaurl#reference", "REF", None)?,
Some("reference".to_string()));
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]