[ 
https://issues.apache.org/jira/browse/SPARK-53448?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vindhya G updated SPARK-53448:
------------------------------
    Description: 
 Based on this Stack Overflow thread:
https://stackoverflow.com/questions/79747926/conversion-of-a-pyspark-dataframe-with-a-variant-column-to-pandas-fails-with-an

{code:java}
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate();
data = [
    ('{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',),
    ('{"name": "Bob", "age": 25, "city": "San Francisco", "salary": 92000}',),
    ('{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": 79000}',)
]
df = spark.createDataFrame(data, ["json_data"])
df_variant = df.withColumn("variant_column", expr("parse_json(json_data)"))
df_variant.printSchema()
df_variant.show(truncate=False)
pdf = df_variant.toPandas()
print(pdf)
{code}

The code above throws the error below.

{noformat}
File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090,
 in 
_create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value)
   1088     return VariantVal(value["value"], value["metadata"])
   1089 else:
-> 1090     raise PySparkValueError(errorClass="MALFORMED_VARIANT")

File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47,
 in PySparkException.__init__(self, message, errorClass, messageParameters, 
contexts)
     44 self._error_reader = ErrorClassesReader()
     46 if message is None:
---> 47     self._message = self._error_reader.get_error_message(
     48         cast(str, errorClass), cast(Dict[str, str], messageParameters)
     49     )
     50 else:
     51     self._message = message

File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105,
 in ErrorClassesReader.get_error_message(self, errorClass, messageParameters)
    103 # Verify message parameters.
    104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", 
message_template)
--> 105 assert set(message_parameters_from_template) == set(messageParameters), 
(
    106     f"Undefined error message parameter for error class: {errorClass}. "
    107     f"Parameters: {messageParameters}"
    108 )
    110 def replace_match(match: Match[str]) -> str:
    111     return match.group().translate(str.maketrans("<>", "{}"))

TypeError: 'NoneType' object is not iterable
{noformat}



  was:
 

{code:java}
{{from pyspark.sql import SparkSessionfrom pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate();
data = [
    ('\{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',),
    ('\{"name": "Bob", "age": 25, "city": "San Francisco", "salary": 92000}',),
    ('\{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": 79000}',)
]
df = spark.createDataFrame(data, ["json_data"])
df_variant = df.withColumn("variant_column", expr("parse_json(json_data)"))
df_variant.printSchema()
df_variant.show(truncate=False)
pdf = df_variant.toPandas()print(pdf)}}
{code}

The below code throws below error. 

{noformat}
File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090,
 in 
_create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value)
   1088     return VariantVal(value["value"], value["metadata"])
   1089 else:
-> 1090     raise PySparkValueError(errorClass="MALFORMED_VARIANT")

File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47,
 in PySparkException.__init__(self, message, errorClass, messageParameters, 
contexts)
     44 self._error_reader = ErrorClassesReader()
     46 if message is None:
---> 47     self._message = self._error_reader.get_error_message(
     48         cast(str, errorClass), cast(Dict[str, str], messageParameters)
     49     )
     50 else:
     51     self._message = message

File 
~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105,
 in ErrorClassesReader.get_error_message(self, errorClass, messageParameters)
    103 # Verify message parameters.
    104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", 
message_template)
--> 105 assert set(message_parameters_from_template) == set(messageParameters), 
(
    106     f"Undefined error message parameter for error class: {errorClass}. "
    107     f"Parameters: {messageParameters}"
    108 )
    110 def replace_match(match: Match[str]) -> str:
    111     return match.group().translate(str.maketrans("<>", "{}"))

TypeError: 'NoneType' object is not iterable
{noformat}




> Conversion of a pyspark DataFrame with a Variant column to pandas fails with 
> an error
> -------------------------------------------------------------------------------------
>
>                 Key: SPARK-53448
>                 URL: https://issues.apache.org/jira/browse/SPARK-53448
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 4.0.0
>            Reporter: Vindhya G
>            Priority: Minor
>
>  Based on this Stack Overflow thread:
> https://stackoverflow.com/questions/79747926/conversion-of-a-pyspark-dataframe-with-a-variant-column-to-pandas-fails-with-an
> {code:java}
> from pyspark.sql import SparkSession
> from pyspark.sql.functions import expr
> spark = SparkSession.builder.getOrCreate();
> data = [
>     ('{"name": "Alice", "age": 30, "city": "New York", "salary": 85000}',),
>     ('{"name": "Bob", "age": 25, "city": "San Francisco", "salary": 92000}',),
>     ('{"name": "Charlie", "age": 35, "city": "Los Angeles", "salary": 79000}',)
> ]
> df = spark.createDataFrame(data, ["json_data"])
> df_variant = df.withColumn("variant_column", expr("parse_json(json_data)"))
> df_variant.printSchema()
> df_variant.show(truncate=False)
> pdf = df_variant.toPandas()
> print(pdf)
> {code}
> The code above throws the error below.
> {noformat}
> File 
> ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/sql/pandas/types.py:1090,
>  in 
> _create_converter_to_pandas.<locals>._converter.<locals>.convert_variant(value)
>    1088     return VariantVal(value["value"], value["metadata"])
>    1089 else:
> -> 1090     raise PySparkValueError(errorClass="MALFORMED_VARIANT")
> File 
> ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/exceptions/base.py:47,
>  in PySparkException.__init__(self, message, errorClass, messageParameters, 
> contexts)
>      44 self._error_reader = ErrorClassesReader()
>      46 if message is None:
> ---> 47     self._message = self._error_reader.get_error_message(
>      48         cast(str, errorClass), cast(Dict[str, str], messageParameters)
>      49     )
>      50 else:
>      51     self._message = message
> File 
> ~/.pyenv/versions/3.11.9/lib/python3.11/site-packages/pyspark/errors/utils.py:105,
>  in ErrorClassesReader.get_error_message(self, errorClass, messageParameters)
>     103 # Verify message parameters.
>     104 message_parameters_from_template = re.findall("<([a-zA-Z0-9_-]+)>", 
> message_template)
> --> 105 assert set(message_parameters_from_template) == 
> set(messageParameters), (
>     106     f"Undefined error message parameter for error class: 
> {errorClass}. "
>     107     f"Parameters: {messageParameters}"
>     108 )
>     110 def replace_match(match: Match[str]) -> str:
>     111     return match.group().translate(str.maketrans("<>", "{}"))
> TypeError: 'NoneType' object is not iterable
> {noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to