davidlghellin commented on code in PR #22890: URL: https://github.com/apache/datafusion/pull/22890#discussion_r3391141423
########## datafusion/sqllogictest/test_files/spark/string/levenshtein.slt: ########## @@ -21,12 +21,224 @@ # For more information, please see: # https://github.com/apache/datafusion/issues/15914 -## Original Query: SELECT levenshtein('kitten', 'sitting'); -## PySpark 3.5.5 Result: {'levenshtein(kitten, sitting)': 3, 'typeof(levenshtein(kitten, sitting))': 'int', 'typeof(kitten)': 'string', 'typeof(sitting)': 'string'} -#query -#SELECT levenshtein('kitten'::string, 'sitting'::string); - -## Original Query: SELECT levenshtein('kitten', 'sitting', 2); -## PySpark 3.5.5 Result: {'levenshtein(kitten, sitting, 2)': -1, 'typeof(levenshtein(kitten, sitting, 2))': 'int', 'typeof(kitten)': 'string', 'typeof(sitting)': 'string', 'typeof(2)': 'int'} -#query -#SELECT levenshtein('kitten'::string, 'sitting'::string, 2::int); +## ── Basic usage ───────────────────────────────────────────── + +## Basic distance +query I +SELECT levenshtein('kitten', 'sitting'); +---- +3 + +## Identical strings +query I +SELECT levenshtein('hello', 'hello'); +---- +0 + +## Empty string vs non-empty +query I +SELECT levenshtein('', 'abc'); +---- +3 + +## Both empty strings +query I +SELECT levenshtein('', ''); +---- +0 + +## Single character difference +query I +SELECT levenshtein('abc', 'adc'); +---- +1 + +## ── Threshold (3-argument form) ───────────────────────────── + +## Distance within threshold +query I +SELECT levenshtein('kitten', 'sitting', 4); +---- +3 + +## Distance exceeds threshold → returns -1 +query I +SELECT levenshtein('kitten', 'sitting', 2); +---- +-1 + +## Distance equals threshold (boundary) → returns distance +query I +SELECT levenshtein('kitten', 'sitting', 3); +---- +3 + +## Threshold zero with different strings +query I +SELECT levenshtein('abc', 'def', 0); +---- +-1 + +## Threshold zero with identical strings +query I +SELECT levenshtein('abc', 'abc', 0); +---- +0 + +## ── Null handling ─────────────────────────────────────────── + +## First argument null +query I +SELECT 
levenshtein(CAST(NULL AS STRING), 'hello'); +---- +NULL + +## Second argument null +query I +SELECT levenshtein('hello', CAST(NULL AS STRING)); +---- +NULL + +## Both arguments null +query I +SELECT levenshtein(CAST(NULL AS STRING), CAST(NULL AS STRING)); +---- +NULL + +## Null threshold +query I +SELECT levenshtein('kitten', 'sitting', CAST(NULL AS INT)); +---- +NULL + +## ── Unicode and special characters ────────────────────────── + +## Unicode strings +query I +SELECT levenshtein('café', 'cafe'); +---- +1 + +## Strings with spaces +query I +SELECT levenshtein('hello world', 'hello world!'); +---- +1 + +## ── Column expressions ────────────────────────────────────── + +## Levenshtein on columns from inline table +query I +SELECT levenshtein(s1, s2) AS result FROM VALUES ('abc', 'abc'), ('abc', 'def'), ('kitten', 'sitting') AS t(s1, s2); +---- +0 +3 +3 + +## Threshold on columns from inline table +query I +SELECT levenshtein(s1, s2, 2) AS result FROM VALUES ('abc', 'abc'), ('abc', 'def'), ('kitten', 'sitting') AS t(s1, s2); +---- +0 +-1 +-1 + +## ── Per-row threshold ─────────────────────────────────────── + +## Different threshold per row +query I +SELECT levenshtein(s1, s2, t) AS result FROM VALUES ('abc', 'def', 2), ('abc', 'def', 5), ('abc', 'def', 3) AS t(s1, s2, t); Review Comment: Renamed the table alias from `t` to `v` and the threshold column from `t` to `threshold` in the four affected cases, so the alias and the column no longer share the name `t`. Failures now point at unambiguous names. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
