cstavr commented on code in PR #46280: URL: https://github.com/apache/spark/pull/46280#discussion_r1598693494
########## python/pyspark/sql/types.py: ########## @@ -1702,9 +1791,16 @@ def _parse_datatype_json_string(json_string: str) -> DataType: return _parse_datatype_json_value(json.loads(json_string)) -def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: +def _parse_datatype_json_value( + json_value: Union[dict, str], + fieldPath: str = "", + collationsMap: Optional[Dict[str, str]] = None, +) -> DataType: if not isinstance(json_value, dict): if json_value in _all_atomic_types.keys(): + if collationsMap is not None and fieldPath in collationsMap: + collationName = collationsMap[fieldPath].split(".")[1] Review Comment: So we are using only the name? What about the rest of the parts? ########## python/pyspark/sql/types.py: ########## @@ -263,21 +263,31 @@ def __init__(self, collation: Optional[str] = None): def fromCollationId(self, collationId: int) -> "StringType": return StringType(StringType.collationNames[collationId]) - def collationIdToName(self) -> str: - if self.collationId == 0: - return "" - else: - return " collate %s" % StringType.collationNames[self.collationId] + @classmethod + def collationIdToName(cls, collationId: int) -> str: + return StringType.collationNames[collationId] @classmethod def collationNameToId(cls, collationName: str) -> int: return StringType.collationNames.index(collationName) + @classmethod + def collationProvider(cls, collationName: str) -> str: + if collationName.startswith("UTF8"): + return "spark" + return "icu" + def simpleString(self) -> str: - return "string" + self.collationIdToName() + return ( + "string" + if self.isUTF8BinaryCollation() + else "string collate " + self.collationIdToName(self.collationId) + ) Review Comment: nit: Since this is multi-line anyway, I think that a standard if/else block is better here. 
########## python/pyspark/sql/types.py: ########## @@ -1756,6 +1854,15 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: ) +def _parse_type_with_collation( + json_value: Dict[str, Any], fieldPath: str, collationsMap: Optional[Dict[str, str]] +) -> DataType: + if collationsMap and fieldPath in collationsMap: + collationName = collationsMap[fieldPath].split(".")[1] Review Comment: ditto ########## python/pyspark/sql/types.py: ########## @@ -1756,6 +1854,15 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: ) +def _parse_type_with_collation( + json_value: Dict[str, Any], fieldPath: str, collationsMap: Optional[Dict[str, str]] +) -> DataType: Review Comment: Don't we also need the string type check here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org