GideonPotok commented on PR #45738:
URL: https://github.com/apache/spark/pull/45738#issuecomment-2034795190
@uros-db what do you think of it? Is there anything else you would like me to
test? I also did some ad-hoc testing of these functions in the Spark shell and
the PySpark shell, and everything looks good.
```
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 4.0.0-SNAPSHOT
/_/
Using Scala version 2.13.13 (OpenJDK 64-Bit Server VM, Java 17.0.10)
>....
| import org.apache.spark.sql._
| import org.apache.spark.sql.functions._
| import org.apache.spark.unsafe.types.UTF8String
| val df = spark.range(1000 * 1000).toDF("id")
|
| val dff = df.withColumn("random_string",
lit(UTF8String.fromString(Random.nextString(25))))
| dff.show(1, 200, true)
| val dfff = dff.withColumn("my_substring",
substring(col("random_string"), 5, 5))
| .withColumn("my_leftstring", left(col("random_string"), lit(5)))
| .withColumn("my_rightstring", right(col("random_string"), lit(5)))
| .withColumn("my_concat", concat(col("my_substring"),
col("my_leftstring"), col("my_rightstring")))
|
| dfff.show(1, 50, true)
-RECORD 0----------------------------------------------------
id | 0
random_string | 箲䨂䡶栣䌳벣ሰ‸ꠂ昊룃쉳믠䭐沂各䬙j穯顆첤鞅ల
only showing top 1 rows
-RECORD 0-----------------------------------------------------
id | 0
random_string | 箲䨂䡶栣䌳벣ሰ‸ꠂ昊룃쉳믠䭐沂各䬙j穯顆첤鞅ల
my_substring | 栣䌳벣ሰ‸
my_leftstring | 箲䨂䡶栣
my_rightstring | 顆첤鞅ల
my_concat | 栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల
only showing top 1 rows
import scala.util.Random
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.unsafe.types.UTF8String
val df: org.apache.spark.sql.DataFrame = [id: bigint]
val dff: org.apache.spark.sql.DataFrame = [id: bigint, random_string: string]
val dfff: org.apache.spark.sql.DataFrame = [id: bigint, random_string:
string ... 4 more fields]
scala> dfff.show(1, 50, true)
| dfff.registerTempTable("mytable")
| spark.sql("select * from mytable").show(1)
| spark.sqlContext.setConf("spark.sql.collation.enabled", "true")
| spark.sql("select COLLATION( random_string), COLLATION(
my_substring), COLLATION( my_leftstring), COLLATION( my_rightstring),
COLLATION( my_concat) from mytable").show(1)
warning: 1 deprecation (since 2.0.0); for details, enable `:setting
-deprecation` or `:replay -deprecation`
-RECORD 0-----------------------------------------------------
id | 0
random_string | 箲䨂䡶栣䌳벣ሰ‸ꠂ昊룃쉳믠䭐沂各䬙j穯顆첤鞅ల
my_substring | 栣䌳벣ሰ‸
my_leftstring | 箲䨂䡶栣
my_rightstring | 顆첤鞅ల
my_concat | 栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల
only showing top 1 rows
+---+---------------------------------+------------+-------------+--------------+-------------------------+
| id|
random_string|my_substring|my_leftstring|my_rightstring|
my_concat|
+---+---------------------------------+------------+-------------+--------------+-------------------------+
| 0|箲䨂䡶栣䌳벣ሰ‸ꠂ昊룃쉳믠䭐沂各...| 栣䌳벣ሰ‸| 箲䨂䡶栣| 顆첤鞅ల|栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల|
+---+---------------------------------+------------+-------------+--------------+-------------------------+
only showing top 1 rows
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
|collation(random_string)|collation(my_substring)|collation(my_leftstring)|collation(my_rightstring)|collation(my_concat)|
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
| UTF8_BINARY| UTF8_BINARY| UTF8_BINARY|
UTF8_BINARY| UTF8_BINARY|
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
only showing top 1 rows
scala> spark.sql("select collate(my_concat, 'unicode') as
unicode_myconcat, collate(my_concat, 'utf8_binary') as utf8_binary_myconcat,
collate(my_concat, 'utf8_binary_lcase') as utf8_binary_lcase_myconcat,
collate(my_concat, 'unicode_ci') as unicode_ci_myconcat from mytable").show(1)
|
+-------------------------+-------------------------+--------------------------+-------------------------+
| unicode_myconcat|
utf8_binary_myconcat|utf8_binary_lcase_myconcat| unicode_ci_myconcat|
+-------------------------+-------------------------+--------------------------+-------------------------+
|栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల|栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల| 栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల|栣䌳벣ሰ‸箲䨂䡶栣顆첤鞅ల|
+-------------------------+-------------------------+--------------------------+-------------------------+
only showing top 1 rows
scala> spark.sql("select left(collate(my_concat, 'unicode'), 5) as
unicode_myconcat, right(collate(my_concat, 'utf8_binary'), 5) as
utf8_binary_myconcat, substr(collate(my_concat, 'utf8_binary_lcase'), 5, 5) as
utf8_binary_lcase_myconcat, left(collate(my_concat, 'unicode_ci'), 1) as
unicode_ci_myconcat
from mytable").show(1)
|
+----------------+--------------------+--------------------------+-------------------+
|unicode_myconcat|utf8_binary_myconcat|utf8_binary_lcase_myconcat|unicode_ci_myconcat|
+----------------+--------------------+--------------------------+-------------------+
| 栣䌳벣ሰ‸| 顆첤鞅ల| ‸箲䨂䡶| 栣|
+----------------+--------------------+--------------------------+-------------------+
only showing top 1 rows
scala> spark.sql("select COLLATION(left(collate(my_concat, 'unicode'),
5)) as unicode_myconcat, COLLATION(right(collate(my_concat, 'utf8_binary'), 5))
as utf8_binary_myconcat, COLLATION(substr(collate(my_concat,
'utf8_binary_lcase'), 5, 5)) as utf8_binary_lcase_myconcat,
COLLATION(left(collate(my_conc
at, 'unicode_ci'), 1)) as unicode_ci_myconcat from mytable").show(1)
|
+----------------+--------------------+--------------------------+-------------------+
|unicode_myconcat|utf8_binary_myconcat|utf8_binary_lcase_myconcat|unicode_ci_myconcat|
+----------------+--------------------+--------------------------+-------------------+
| UNICODE| UTF8_BINARY| UTF8_BINARY_LCASE|
UNICODE_CI|
+----------------+--------------------+--------------------------+-------------------+
only showing top 1 rows
scala> spark.sql("select COLLATION(left(collate(my_concat, 'unicode'),
5)) as unicode_myconcat, COLLATION(right(collate(my_concat, 'utf8_binary'), 5))
as utf8_binary_myconcat, COLLATION(substr(collate(my_concat,
'utf8_binary_lcase'), 5, 5)) as utf8_binary_lcase_myconcat,
COLLATION(left(collate(my_conca
t, 'unicode_ci'), 1)) as unicode_ci_myconcat from mytable").write
val res5: org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row] =
org.apache.spark.sql.DataFrameWriter@6dd31bab
scala> spark.sql("select left(collate(my_concat, 'unicode'), 5) as
unicode_myconcat, right(collate(my_concat, 'utf8_binary'), 5) as
utf8_binary_myconcat, substr(collate(my_concat, 'utf8_binary_lcase'), 5, 5) as
utf8_binary_lcase_myconcat, left(collate(my_concat, 'unicode_ci'), 1) as
unicode_ci_myconcat
from mytable")
| .write.mode("overwrite").parquet("mytable.parquet")
24/04/02 10:10:24 WARN MemoryManager: Total allocation exceeds 95.00%
(1,020,054,720 bytes) of heap memory
scala>
scala> val readdf = spark.read.parquet("mytable.parquet")
|
val readdf: org.apache.spark.sql.DataFrame = [unicode_myconcat: string
collate UNICODE, utf8_binary_myconcat: string ... 2 more fields]
scala> readdf.show(1, 50, true)
|
-RECORD 0------------------------------
unicode_myconcat | 栣䌳벣ሰ‸
utf8_binary_myconcat | 顆첤鞅ల
utf8_binary_lcase_myconcat | ‸箲䨂䡶
unicode_ci_myconcat | 栣
only showing top 1 rows
scala> readdf.createOrReplaceTempView("mytable2")
|
scala> spark.sql("select COLLATION(unicode_myconcat),
COLLATION(utf8_binary_myconcat), COLLATION(utf8_binary_lcase_myconcat),
COLLATION(unicode_ci_myconcat) from mytable2").show(1)
|
+---------------------------+-------------------------------+-------------------------------------+------------------------------+
|collation(unicode_myconcat)|collation(utf8_binary_myconcat)|collation(utf8_binary_lcase_myconcat)|collation(unicode_ci_myconcat)|
+---------------------------+-------------------------------+-------------------------------------+------------------------------+
| UNICODE| UTF8_BINARY|
UTF8_BINARY_LCASE| UNICODE_CI|
+---------------------------+-------------------------------+-------------------------------------+------------------------------+
only showing top 1 rows
scala>
:quit
gideon@Gideon's MacBook Pro spark % ./bin/pyspark
Python 3.9.6 (default, Feb 3 2024, 15:58:27)
[Clang 15.0.0 (clang-1500.3.9.4)] on darwin
>>> df = spark.range(1000 * 1000).toDF("id")
>>> import pyspark.sql.functions as F
>>> import random
>>> import string
>>> dff = df.withColumn("random_string",
F.lit("".join([random.choice(string.ascii_letters) for _ in range(25)])))
>>> dff.show(1, 100, True)
-RECORD 0----------------------------------
id | 0
random_string | fWuOlqjSthNLbHQUuxGizXuKX
only showing top 1 rows
>>> dfff = dff.withColumn("my_substring",
F.substring(F.col("random_string"), 5, 5)) \
... .withColumn("my_leftstring", F.left(F.col("random_string"),
F.lit(5))) \
... .withColumn("my_rightstring", F.right(F.col("random_string"),
F.lit(5))) \
... .withColumn("my_concat", F.concat(F.col("my_substring"),
F.col("my_leftstring"), F.col("my_rightstring")))
>>>
>>> dfff.show(10, 100, True)
-RECORD 0-----------------------------------
id | 0
random_string | fWuOlqjSthNLbHQUuxGizXuKX
my_substring | lqjSt
my_leftstring | fWuOl
my_rightstring | zXuKX
my_concat | lqjStfWuOlzXuKX
only showing top 1 rows
>>> dfff.createOrReplaceTempView("mytable")
>>> spark.sql("select * from mytable").show(1)
+---+--------------------+------------+-------------+--------------+---------------+
| id| random_string|my_substring|my_leftstring|my_rightstring|
my_concat|
+---+--------------------+------------+-------------+--------------+---------------+
| 0|fWuOlqjSthNLbHQUu...| lqjSt| fWuOl|
zXuKX|lqjStfWuOlzXuKX|
+---+--------------------+------------+-------------+--------------+---------------+
only showing top 1 rows
>>> spark.sql("select COLLATION( random_string), COLLATION( my_substring),
COLLATION( my_leftstring), COLLATION( my_rightstring), COLLATION( my_concat)
from mytable").show()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/gideon/repos/spark/python/pyspark/sql/session.py", line 1711,
in sql
return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
File
"/Users/gideon/repos/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py",
line 1322, in __call__
File
"/Users/gideon/repos/spark/python/pyspark/errors/exceptions/captured.py", line
221, in deco
raise converted from None
pyspark.errors.exceptions.captured.AnalysisException:
[UNSUPPORTED_FEATURE.COLLATION] The feature is not supported: Collation is not
yet supported. SQLSTATE: 0A000;
Project [collation(random_string#4) AS collation(random_string)#107,
collation(my_substring#29) AS collation(my_substring)#108,
collation(my_leftstring#33) AS collation(my_leftstring)#109,
collation(my_rightstring#38) AS collation(my_rightstring)#110,
collation(my_concat#44) AS collation(my_concat)#111]
+- SubqueryAlias mytable
+- View (`mytable`, [id#2L, random_string#4, my_substring#29,
my_leftstring#33, my_rightstring#38, my_concat#44])
+- Project [id#2L, random_string#4, my_substring#29, my_leftstring#33,
my_rightstring#38, concat(my_substring#29, my_leftstring#33, my_rightstring#38)
AS my_concat#44]
+- Project [id#2L, random_string#4, my_substring#29,
my_leftstring#33, right(random_string#4, 5) AS my_rightstring#38]
+- Project [id#2L, random_string#4, my_substring#29,
left(random_string#4, 5) AS my_leftstring#33]
+- Project [id#2L, random_string#4,
substring(random_string#4, 5, 5) AS my_substring#29]
+- Project [id#2L, fWuOlqjSthNLbHQUuxGizXuKX AS
random_string#4]
+- Project [id#0L AS id#2L]
+- Range (0, 1000000, step=1, splits=Some(16))
>>> spark.conf.set("spark.sql.collation.enabled", "true")
>>> spark.sql("select COLLATION( random_string), COLLATION( my_substring),
COLLATION( my_leftstring), COLLATION( my_rightstring), COLLATION( my_concat)
from mytable").show(1)
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
|collation(random_string)|collation(my_substring)|collation(my_leftstring)|collation(my_rightstring)|collation(my_concat)|
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
| UTF8_BINARY| UTF8_BINARY| UTF8_BINARY|
UTF8_BINARY| UTF8_BINARY|
+------------------------+-----------------------+------------------------+-------------------------+--------------------+
only showing top 1 rows
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]