[ https://issues.apache.org/jira/browse/SPARK-53448?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Vindhya G updated SPARK-53448: ------------------------------ Description: Based on this thread on Stack Overflow: https://stackoverflow.com/questions/79747926/conversion-of-a-pyspark-dataframe-with-a-variant-column-to-pandas-fails-with-an {code:java} from pyspark.sql import SparkSession from pyspark.sql.functions import expr spark = SparkSession.builder.getOrCreate(); data = [ ('{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',), ('{"name": "Bob", "age": 25, "city": "San Francisco", "salary": 92000}',), ('{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": 79000}',) ] df = spark.createDataFrame(data, ["json_data"]) df_variant = df.withColumn("variant_column", expr("parse_json(json_data)")) df_variant.printSchema() df_variant.show(truncate=False) pdf = df_variant.toPandas() print(pdf) {code} The code above throws the error below. {noformat} File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090, in _create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value) 1088 return VariantVal(value["value"], value["metadata"]) 1089 else: -> 1090 raise PySparkValueError(errorClass="MALFORMED_VARIANT") File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47, in PySparkException.__init__(self, message, errorClass, messageParameters, contexts) 44 self._error_reader = ErrorClassesReader() 46 if message is None: ---> 47 self._message = self._error_reader.get_error_message( 48 cast(str, errorClass), cast(Dict[str, str], messageParameters) 49 ) 50 else: 51 self._message = message File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105, in ErrorClassesReader.get_error_message(self, errorClass, messageParameters) 103 # Verify message parameters. 
104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", message_template) --> 105 assert set(message_parameters_from_template) == set(messageParameters), ( 106 f"Undefined error message parameter for error class: {errorClass}. " 107 f"Parameters: {messageParameters}" 108 ) 110 def replace_match(match: Match[str]) -> str: 111 return match.group().translate(str.maketrans("<>", "{}")) TypeError: 'NoneType' object is not iterable {noformat} was: {code:java} from pyspark.sql import SparkSession from pyspark.sql.functions import expr spark = SparkSession.builder.getOrCreate(); data = [ ('{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',), ('{"name": "Bob", "age": 25, "city": "San Francisco", "salary": 92000}',), ('{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": 79000}',) ] df = spark.createDataFrame(data, ["json_data"]) df_variant = df.withColumn("variant_column", expr("parse_json(json_data)")) df_variant.printSchema() df_variant.show(truncate=False) pdf = df_variant.toPandas() print(pdf) {code} The code above throws the error below. 
{noformat} File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090, in _create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value) 1088 return VariantVal(value["value"], value["metadata"]) 1089 else: -> 1090 raise PySparkValueError(errorClass="MALFORMED_VARIANT") File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47, in PySparkException.__init__(self, message, errorClass, messageParameters, contexts) 44 self._error_reader = ErrorClassesReader() 46 if message is None: ---> 47 self._message = self._error_reader.get_error_message( 48 cast(str, errorClass), cast(Dict[str, str], messageParameters) 49 ) 50 else: 51 self._message = message File ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105, in ErrorClassesReader.get_error_message(self, errorClass, messageParameters) 103 # Verify message parameters. 104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", message_template) --> 105 assert set(message_parameters_from_template) == set(messageParameters), ( 106 f"Undefined error message parameter for error class: {errorClass}. 
" 107 f"Parameters: {messageParameters}" 108 ) 110 def replace_match(match: Match[str]) -> str: 111 return match.group().translate(str.maketrans("<>", "{}")) TypeError: 'NoneType' object is not iterable {noformat} > Conversion of a pyspark DataFrame with a Variant column to pandas fails with > an error > ------------------------------------------------------------------------------------- > > Key: SPARK-53448 > URL: https://issues.apache.org/jira/browse/SPARK-53448 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 4.0.0 > Reporter: Vindhya G > Priority: Minor > > Based on this thread on Stack Overflow: > https://stackoverflow.com/questions/79747926/conversion-of-a-pyspark-dataframe-with-a-variant-column-to-pandas-fails-with-an > {code:java} > from pyspark.sql import SparkSession > from pyspark.sql.functions import expr > spark = SparkSession.builder.getOrCreate(); > data = [ > ('{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',), > ('{"name": "Bob", "age": 25, "city": "San Francisco", "salary": > 92000}',), > ('{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": > 79000}',) > ] > df = spark.createDataFrame(data, ["json_data"]) > df_variant = df.withColumn("variant_column", expr("parse_json(json_data)")) > df_variant.printSchema() > df_variant.show(truncate=False) > pdf = df_variant.toPandas() > print(pdf) > {code} > The code above throws the error below. 
> {noformat} > File > ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090, > in > _create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value) > 1088 return VariantVal(value["value"], value["metadata"]) > 1089 else: > -> 1090 raise PySparkValueError(errorClass="MALFORMED_VARIANT") > File > ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47, > in PySparkException.__init__(self, message, errorClass, messageParameters, > contexts) > 44 self._error_reader = ErrorClassesReader() > 46 if message is None: > ---> 47 self._message = self._error_reader.get_error_message( > 48 cast(str, errorClass), cast(Dict[str, str], messageParameters) > 49 ) > 50 else: > 51 self._message = message > File > ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105, > in ErrorClassesReader.get_error_message(self, errorClass, messageParameters) > 103 # Verify message parameters. > 104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", > message_template) > --> 105 assert set(message_parameters_from_template) == > set(messageParameters), ( > 106 f"Undefined error message parameter for error class: > {errorClass}. " > 107 f"Parameters: {messageParameters}" > 108 ) > 110 def replace_match(match: Match[str]) -> str: > 111 return match.group().translate(str.maketrans("<>", "{}")) > TypeError: 'NoneType' object is not iterable > {noformat} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org